Important packages:

setwd("Dataset")
Warning: The working directory was changed to C:/Users/DELL/Documents/GitHub/Project_IT326/Dataset inside a notebook chunk. The working directory will be reset when the chunk is finished running. Use the knitr root.dir option in the setup chunk to change the working directory for notebook chunks.
Dataset <- read.csv("data.csv")
Preprocessed_dataset <-  read.csv("preprocessed_dataset.csv") 
if(!require(ggplot2)){
install.packages("ggplot2")}
library(ggplot2)
if(!require(dplyr)){
install.packages("dplyr")}
library(dplyr) 
if(!require(dplyr)){
install.packages("dplyr")}
library(dplyr)
if(!require(ltm)){
install.packages("ltm")}
library(ltm)
if (!require(cluster, quietly = TRUE)) {
  install.packages("cluster")
}
if (!require(factoextra, quietly = TRUE)) {
  install.packages("factoextra")
}
 
if (!require(caret, quietly = TRUE)) {
  install.packages("caret")
}

if (!require(partykit, quietly = TRUE)) {
  install.packages("partykit")
}

if (!require(rpart, quietly = TRUE)) {
  install.packages("rpart")
}
if (!require(rpart.plot, quietly = TRUE)) {
  install.packages("rpart.plot")
}
if (!require(tidyverse, quietly = TRUE)) {
  install.packages("tidyverse")
}

IT326 Project

_________________________________________________________________________________________________________

The Goal

Our primary objective of this analysis is to classify whatever a student will go to college or not using the classification methods and to identify the main factors and reasons why students are less likely to pursue higher education indicated by “will_go_to_college” being ‘False’. By leveraging the provided dataset with attributes such as school type, school accreditation, gender, interest in college, residence, we aim to discover the most influential variables and their relationships with the decision not to attend college.

The Source

Kaggle.com

General information

  • Number of attributes : 11
  • Number of rows (objects) : 1000
  • The class label: The class label in our project is the attribute “will_go_to_college”. This attribute is binary, meaning that it can take on two values: True for yes or False for no. The value of this attribute will be the target variable that we are trying to predict during our project.

Attribute Description Type Possible values
type_school The type of school the student attends Binary Academic / Vocational
school_accreditation The quality if school Binary A / B (A is better than B)
gender The student’s gender Binary Male / Female
interest The student’s interest in going to college Nominal Very interested /Interested / Less Interested / Not Interested /Uncertain
residence The student’s residence Binary Urban / Rural
parent_age The age of the student’s parents Numeric 40 - 65
parent_salary

The monthly salary of the student’s parents in IDR/Rupiah.

[1Rupiah = 0.00024SAR]

Numeric 1000K - 10M
house_area The size of the student’s house in meter square Numeric 20 - 120
average_grades The student’s average grades in school. Numeric 75 - 98
parent_was_in_college Whether the student’s parents attended college. Binary True - False
will_go_to_college Whether the student plans to go to college. Binary True - False

Sample of our data

head(Dataset)

Here are a sample of 6 row from our dataset.

Missing values

sum(is.na(Dataset))
[1] 0

There are no missing values in our dataset.

Statistical graphs

Graph 1:

df =data.frame(Dataset)
ggplot(data=df, aes(x = interest, fill = will_go_to_college)) +
  geom_bar() +
  scale_x_discrete(limits = c('Not Interested', 'Less Interested', 'Uncertain', 'Interested', 'Very Interested')) +
  labs(title = 'College interest vs College attendance ') +
  scale_fill_manual(values = c("True" = "antiquewhite2", "False" = "antiquewhite3")) +
  theme_minimal()

NA
NA

According to the graph, whether students are interested in going to college or not does not affect whether they actually end up attending Collage . There is a group of individuals who were interested in attending but did not receive acceptance, while others who were not interested were accepted.

Graph 2:


filtered_True =filter(Dataset, will_go_to_college == 'True')
filtered_False =subset(Dataset, will_go_to_college =='False')

ggplot() +
  geom_density(data = filtered_True, aes(x = average_grades, fill = "Going to College"), alpha = 0.5) +
  geom_density(data = filtered_False, aes(x = average_grades, fill = "NOT Going to College"), alpha = 0.5) +
  labs(x = "Average Grades", y = "Density") +
  ggtitle("Comparison of Average Grades for Students Going to College and NOT Going to College") +
  scale_fill_manual(values = c("Going to College" = "antiquewhite4", "NOT Going to College" = "antiquewhite1"))

The graph shows that the average grades for students who had accepted to go to college were higher than those who did not enter college , and this indicates the existence of a correlation between those who going to college and the average grades

Graph 3:

Dataset_percentage <- Dataset %>%
  group_by(type_school) %>%
  summarise(percentage = mean(will_go_to_college == "True") * 100)

# Create a percentage chart
ggplot(Dataset_percentage, aes(x = type_school, y = percentage, fill = type_school)) +
  geom_bar(stat = "identity") +
  labs(title = "Percentage of Students Going to College by Type of School",
       x = "Type of School",
       y = "Percentage") +
  scale_fill_manual(values = c("Academic" = "antiquewhite3", "Vocational" = "antiquewhite2")) +
  theme_minimal()

NA
NA

This graph shows the impact of the type of high school attended by students on their college attendance . Based on the bar chart:

  • among students from Academic high schools, 313 are going to college, and 296 are not.

  • among students from Vocational high schools, 187 are going to college, and 204 are not

These information tell us that a higher proportion of students from academic high schools are going to college compared to those from vocational schools which suggest that the type of school attended.

Graph 4:

ggplot(Dataset, aes(x = average_grades)) +
  geom_histogram(binwidth = 5, fill = "antiquewhite2", color = "antiquewhite4") +
  labs(title = "Distribution of students' grades",
       x = "Students' average grades",
       y = "Frequency") +
  theme_minimal()

This histogram show us that the majority of the students in the dataset are performing well since it seems like their grades are spanning between 75 and 98. This analysis will help us determine whether the academic performance level of students is a contributing factor to their college attendance or not.

Statistical Measures

  1. The student’s academic performance analysis:
summary(Dataset$average_grades)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  75.00   83.74   85.58   86.10   88.26   98.00 

The student grades in our dataset range from 75.00 to 98.00, with a median of 85.58 and an average of 86.10. This suggests that most students are doing well as none of them have average grades below 50. However, it’s interesting to note that some students have much higher or lower grades than the average, mainly due to the wide range of grades..

  1. The socioeconomic status of students’ families analysis:
summary(Dataset$parent_salary)
    Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
 1000000  4360000  5440000  5381570  6382500 10000000 

In the dataset, we’ve got students parents with salaries ranging from 1,000,000 to 10,000,000 IDR/Rupiah. The median salary is 5,440,000 IDR/Rupiah, and the average is 5,381,570 IDR/Rupiah. This data tells us that many parents in our dataset earn less than the average salary in Indonesia, which is 146,000,000 IDR. This suggests that quite a few students in our dataset come from families with limited finances. And this financial situation could certainly impact their ability to get through college.

summary(Dataset$house_area)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  20.00   64.60   75.50   74.52   84.83  120.00 

Additionally we can utilize the house area attribute to gain a deeper understanding of the socioeconomic status of students’ families, where students with houses significantly larger than the mean might indicate a higher socioeconomic status, while those with houses considerably smaller than the mean might reflect a comparatively lower socioeconomic status. Based on the shown output, the house areas range from [20.00-120.00 ㎡]. The median house area is 75.50 ㎡ indicates that families with house areas around this value likely have moderate socioeconomic status with houses that neither very small nor very large.

  1. Understanding Parent Age Range and Variation in its Values
summary(Dataset$parent_age)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
  40.00   50.00   52.00   52.21   54.00   65.00 
SD=sd(Dataset$parent_age)
MeanAge=mean(Dataset$parent_age)
cat("coefficient of variation:",SD/MeanAge*100,"%")
coefficient of variation: 6.704771 %

This summary provides the range for age attribute [40,65] which indicates that all parent in middle age during this age parent have more concern about their children , the coefficient of variation= 6.7% which indicates lower variation ,and the value of attribute parent_agerare are relatively close to the mean overall 25% of them have an age below or equal to 50 , 75% have an age below or equal to 54 and the median value is 52

Outliers analysis

###parent age outliers
quartiles <- quantile(Dataset$parent_age, probs=c(.25, .75), na.rm = FALSE)
IQR <- IQR(Dataset$parent_age)

Lower <- quartiles[1] - 1.5*IQR
Upper <- quartiles[2] + 1.5*IQR 

data_no_outlier <- subset(Dataset, Dataset$parent_age > Lower & Dataset$parent_age < Upper)
dim(data_no_outlier)
[1] 957  11
###parent salary outliers
quartiles <- quantile(Dataset$parent_salary, probs=c(.25, .75), na.rm = FALSE)
IQR <- IQR(Dataset$parent_salary)

Lower <- quartiles[1] - 1.5*IQR
Upper <- quartiles[2] + 1.5*IQR 

data_no_outlier <- subset(data_no_outlier, data_no_outlier$parent_salary> Lower & data_no_outlier$parent_salary < Upper)
dim(data_no_outlier)
[1] 955  11
###averge grades outliers
quartiles <- quantile(Dataset$average_grades, probs=c(.25, .75), na.rm = FALSE)
IQR <- IQR(Dataset$average_grades)

Lower <- quartiles[1] - 1.5*IQR
Upper <- quartiles[2] + 1.5*IQR 

data_no_outlier <- subset(data_no_outlier, data_no_outlier$average_grades> Lower & data_no_outlier$average_grades < Upper)
dim(data_no_outlier)
[1] 944  11
###house area outliers
quartiles <- quantile(Dataset$house_area, probs=c(.25, .75), na.rm = FALSE)
IQR <- IQR(Dataset$house_area)

Lower <- quartiles[1] - 1.5*IQR
Upper <- quartiles[2] + 1.5*IQR 

data_no_outlier <- subset(data_no_outlier, data_no_outlier$house_area> Lower & data_no_outlier$house_area < Upper)

Founded_Outliers=data.frame(anti_join(Dataset,data_no_outlier))
Joining with `by = join_by(type_school, school_accreditation, gender, interest, residence, parent_age, parent_salary, house_area, average_grades, parent_was_in_college, will_go_to_college)`
print(Founded_Outliers)

After conducting data analysis and identifying outliers, our inspection reveals that the detected outliers represent inherent variation within the population. Regarding Parent_age, outliers are observed for values below 44 and above 65. However, it should be noted the age from 40 to 65 fall within the expected mean of our dataset meaning that it doesn’t indicate that they are outliers . For parent_salary, we found two outliers: one below 1,326,250 ind ≈ 85 USD and another above 9,416,250 ind ≈ 606 USD. The minimum and maximum values were determined to be 1,000,000 ind ≈ 64 USD and 10,000,000 ind ≈ 644 USD, respectively. In the case of grades, twelve outliers were identified, ranging from below 76 to above 97. Nevertheless, since the data falls within the acceptable range of 0 to 100, these outliers should be retained as they are still considered normal and within the usual grade range. Finally, for house_area, we found eleven outliers below 34.4m and above 115m, with the minimum being 20m and the maximum being 120m. However, these values are still considered typical for the population.

Normalization

normalize <- function(x) {return((x-min(x))/ (max(x)-min(x)))}
datasetWithoutNormalization<-Dataset
Dataset$parent_salary<-normalize(datasetWithoutNormalization$parent_salary)
Dataset$house_area<-normalize(datasetWithoutNormalization$house_area)
print(Dataset)

We applied normalization to the ‘parent_salary’ and ‘house_area’ attributes, scaling their values to a range between 0 and 1. This normalization process greatly facilitates data handling and analysis, ensuring that these attributes are on a consistent scale. Which will improve the reliability of our data analysis and enable better conclusions to be drawn from the dataset. Normalization is a crucial step in preparing the data for modeling, as it prevents attributes with larger numerical ranges from dominating the analysis and ensures fair treatment for all features.

Discretization



Dataset$average_grades [Dataset$average_grades >= 95] <- '+A'
Dataset$average_grades [95 >Dataset$average_grades & Dataset$average_grades >= 90] <- 'A'
Dataset$average_grades [90 >Dataset$average_grades & Dataset$average_grades >= 85] <- '+B'
Dataset$average_grades [85 >Dataset$average_grades & Dataset$average_grades >= 80] <- 'B'
Dataset$average_grades [80 >Dataset$average_grades & Dataset$average_grades >= 75] <- '+C'
Dataset$average_grades [75 >Dataset$average_grades & Dataset$average_grades >= 70] <- 'C'
Dataset$average_grades [70 >Dataset$average_grades & Dataset$average_grades >= 65] <- '+D'
Dataset$average_grades [65 >Dataset$average_grades & Dataset$average_grades >= 60] <- 'D'
Dataset$average_grades [60 >Dataset$average_grades & Dataset$average_grades >= 0] <- 'F'
Dataset$average_grades <- as.character(Dataset$average_grades )
print(Dataset)

We transformed the parent_age attribute into intervals by dividing the values to be fall on one of two possible interval labels with equal width which is(40,50],(50,60] by discretization the values well be simpler to classify or perform other methods that can help us later in our model.

and to better utilize and interpret the grades attributes for each student, we have converted the numeric grades into letter grades (A+, A, B+, B, C+, C, D+, D, F). This transformation was undertaken to focus on the general letter grade representation rather than the precise numerical values.

Encoding

Dataset$parent_was_in_college[Dataset$parent_was_in_college=="TRUE"]<-1
Dataset$parent_was_in_college[Dataset$parent_was_in_college=="True"]<-1
Dataset$parent_was_in_college[Dataset$parent_was_in_college=="FALSE"]<-0
Dataset$parent_was_in_college[Dataset$parent_was_in_college=="False"]<-0

Dataset$will_go_to_college[Dataset$will_go_to_college=="TRUE"]<-0
Dataset$will_go_to_college[Dataset$will_go_to_college=="True"]<-0
Dataset$will_go_to_college[Dataset$will_go_to_college=="FALSE"]<-1
Dataset$will_go_to_college[Dataset$will_go_to_college=="False"]<-1

Dataset$gender[Dataset$gender=="Female"]<-1
Dataset$gender[Dataset$gender=="Male"]<-0

Dataset$school_accreditation[Dataset$school_accreditation=="A"]<-1
Dataset$school_accreditation[Dataset$school_accreditation=="B"]<-0

Dataset$interest[Dataset$interest=="Very Interested"]<-4
Dataset$interest[Dataset$interest=="Interested"]<-3
Dataset$interest[Dataset$interest=="Less Interested"]<-2
Dataset$interest[Dataset$interest=="Not Interested"]<-1
Dataset$interest[Dataset$interest=="Uncertain"]<-0


Dataset$type_school[Dataset$type_school=="Academic"]<-1 
Dataset$type_school[Dataset$type_school=="Vocational"]<-0

Dataset$residence[Dataset$residence=="Urban"]<-1
Dataset$residence[Dataset$residence=="Rural"]<-0
print(Dataset)

Since encoding is an important step in data preprocessing that enables the use of categorical data in various data analysis and machine learning tasks, we encoded attributes like the ‘parent was in college’ attribute from (True, False) to (1, 0), and ‘will go to college’ from (True, False) to (0, 1). This encoding is carried out as we aim to predict the influencing factors. Additionally, we encoded the ‘gender’ attribute from (Female, Male) to (1, 0), ‘school accreditation’ from (A, B) to (1, 0), ‘type_school’ from (Academic, Vocational) to (1, 0), ‘residence’ from (Urban, Rural) to (1, 0), and ‘interest’ from (Very interested ,Interested , Less Interested , Not Interested ,Uncertain ) to (4,3,2, 1, 0) respectively. Encoding serves to simplify the data, reduce complexity, and enhance its suitability for modeling purposes.

Correlation analysis Chi square test for nominal attribute:


#1
C=chisq.test(Dataset$type_school , Dataset$will_go_to_college)
print(C)

    Pearson's Chi-squared test with Yates' continuity correction

data:  Dataset$type_school and Dataset$will_go_to_college
X-squared = 1.0751, df = 1, p-value = 0.2998
#2

C=chisq.test(Dataset$school_accreditation , Dataset$will_go_to_college)
print(C)

    Pearson's Chi-squared test with Yates' continuity correction

data:  Dataset$school_accreditation and Dataset$will_go_to_college
X-squared = 0.78513, df = 1, p-value = 0.3756
#3
C=chisq.test(Dataset$gender , Dataset$will_go_to_college)
print(C)

    Pearson's Chi-squared test with Yates' continuity correction

data:  Dataset$gender and Dataset$will_go_to_college
X-squared = 1.0249, df = 1, p-value = 0.3114
#4
C=chisq.test(Dataset$interest , Dataset$will_go_to_college)
print(C)

    Pearson's Chi-squared test

data:  Dataset$interest and Dataset$will_go_to_college
X-squared = 73.337, df = 4, p-value = 4.477e-15
#5
C=chisq.test(Dataset$residence , Dataset$will_go_to_college)
print(C)

    Pearson's Chi-squared test with Yates' continuity correction

data:  Dataset$residence and Dataset$will_go_to_college
X-squared = 0.016098, df = 1, p-value = 0.899
#6
C=chisq.test(Dataset$average_grades , Dataset$will_go_to_college)
print(C)

    Pearson's Chi-squared test

data:  Dataset$average_grades and Dataset$will_go_to_college
X-squared = 261.89, df = 4, p-value < 2.2e-16
#7
C=chisq.test(Dataset$parent_was_in_college , Dataset$will_go_to_college)
print(C)

    Pearson's Chi-squared test with Yates' continuity correction

data:  Dataset$parent_was_in_college and Dataset$will_go_to_college
X-squared = 2.1194, df = 1, p-value = 0.1454

All the attributes have X-square greater than the p-value which indicate a some association with the class label; therefore we reject the null hypothesis

we noticed for ‘interest’ and ‘average grade’ the analysis shows that X-square is much larger than p-value indicate the significant association of the two attributes with the decision of the student to go to the collage or not

Correlation coefficient analysis for numeric attribute:


biserial.cor(Dataset$parent_salary,Dataset$will_go_to_college, c("all.obs", "complete.obs"), level = 1)
[1] 0.4756928
biserial.cor(Dataset$house_area,Dataset$will_go_to_college, c("all.obs", "complete.obs"), level = 1)
[1] 0.4672669
biserial.cor(Dataset$parent_age,Dataset$will_go_to_college,c("all.obs", "complete.obs"), level = 1)
[1] 0.04287336
 
 

the analysis shows moderate correlation coefficient for parent salary and house area with the class label which indicate that they are relevant factors meaning that the higher the parent salary and the larger house area the higher probability for a student to enroll in a collage

where is the on other hand, the correlation coefficient for the parent age is very small which indicate that the parent age has little impact to the probability for student to enroll in a collage

Feature selection:

ultimately based on the analysis of the correlation that we conducted on the relationship of the dataset attributes with the class label, and the understanding of the data and the context of each attribute and potential relevance to the class label we decided to not delete any of the attribute

Clustering Analysis:

In this analysis, we apply K-means clustering to the dataset using different values of K. K-means clustering is an unsupervised learning algorithm that partitions the data into K clusters based on similarity. We will explore three different values of K and evaluate the clustering results using various metrics.

Removing the class label and preparing the dataset for Clustering

 
original_data <- Preprocessed_dataset

# Remove any non-numeric attributes
numeric_data <- original_data[, sapply(original_data, is.numeric)]

# Remove the class label 'will_go_to_college'
numeric_data <- numeric_data[, !(names(numeric_data) == 'will_go_to_college')]


# Print the dataset "numeric_data" to make sure it's prepared for clustering
print(numeric_data)

# Scaling the dataset
# numeric_data <- scale(numeric_data)

 

Now, the ‘numeric_dataset’ dataset contains only numeric attributes without the class label, which makes it ready for the clustering process.

K=2

# k-means clustering set a seed for random number generation to make the results reproducible 
set.seed(8953)

# run kmeans clustering to find 2 clusters
kmeans.result <- kmeans(numeric_data, 2)

# visualize clustering
library(factoextra)
fviz_cluster(kmeans.result, data = numeric_data)


# print the clustering result
print(kmeans.result)
K-means clustering with 2 clusters of sizes 531, 469

Cluster means:
  type_school school_accreditation    gender interest residence parent_age parent_salary
1   0.5856874            0.4067797 0.5235405 2.001883 0.3163842   49.67985     0.5219000
2   0.6353945            0.5650320 0.4413646 2.296375 0.7910448   55.07036     0.4471476
  house_area parent_was_in_college
1  0.5351620              0.700565
2  0.5564648              0.315565

Clustering vector:
   [1] 2 2 1 1 2 1 1 2 1 1 2 1 1 2 1 2 1 1 1 1 1 2 1 2 2 1 2 2 1 1 1 2 2 1 1 2 2 1 2 2 1 2 2 1 1
  [46] 1 1 2 2 2 1 2 1 1 1 2 2 1 1 2 1 1 1 1 2 2 1 1 1 2 2 1 1 2 2 2 2 2 1 1 2 1 2 1 2 2 2 1 1 2
  [91] 2 2 2 1 1 1 1 2 1 1 1 2 1 1 1 2 1 2 2 2 1 1 1 1 1 1 1 2 2 2 1 1 1 1 2 2 1 1 1 2 2 1 2 1 1
 [136] 1 2 1 2 2 2 2 1 1 2 1 1 2 1 1 1 1 2 1 2 1 2 1 2 2 2 1 2 2 1 2 2 2 1 1 2 1 2 1 2 1 1 1 1 1
 [181] 2 1 2 1 1 2 2 2 2 2 2 1 1 2 2 2 1 2 2 2 2 2 2 2 1 2 2 1 2 1 1 1 1 2 2 1 2 1 1 1 2 1 2 1 2
 [226] 1 2 1 2 1 2 2 2 1 1 2 2 1 1 2 2 1 1 1 2 2 1 1 1 2 2 1 1 2 1 1 1 2 1 2 2 2 1 1 2 1 2 1 2 1
 [271] 1 1 1 2 1 1 1 2 1 2 1 1 2 1 1 2 2 2 1 1 2 1 2 1 1 2 1 1 1 2 1 2 1 1 1 1 1 2 1 2 1 2 2 2 2
 [316] 2 1 2 2 1 2 2 2 1 1 2 1 2 1 2 2 1 1 2 2 2 1 1 2 1 1 1 1 1 1 1 1 2 1 2 1 1 1 1 2 1 1 2 1 1
 [361] 1 1 1 1 1 1 1 1 1 2 2 2 2 2 1 1 2 1 2 2 1 2 1 2 2 1 2 2 1 2 1 2 1 1 1 2 1 1 1 1 2 2 1 2 1
 [406] 2 2 1 2 1 1 2 1 2 1 2 2 1 2 2 1 1 2 2 2 2 1 2 2 2 1 1 1 2 2 1 2 1 2 1 2 1 2 2 2 1 1 1 1 2
 [451] 1 1 1 2 1 2 1 1 2 1 1 2 2 1 1 1 1 1 2 1 2 1 2 2 1 1 1 1 1 1 2 1 1 1 2 1 2 2 1 1 1 1 1 2 1
 [496] 2 1 2 1 2 2 2 2 1 2 2 2 1 2 2 1 1 1 2 2 2 2 1 1 2 1 2 2 1 1 1 2 1 1 1 1 2 1 1 2 2 1 1 1 1
 [541] 1 1 1 2 1 1 1 1 2 2 1 2 2 2 2 1 1 1 1 1 1 2 2 1 2 1 2 2 1 2 2 2 2 1 1 2 2 1 1 2 1 1 2 1 2
 [586] 2 1 1 1 2 1 2 2 2 1 1 2 2 1 2 2 2 1 2 2 1 2 1 2 1 1 1 1 1 2 2 1 1 2 2 1 1 2 1 2 2 1 1 2 2
 [631] 1 1 2 1 2 2 1 1 1 2 1 2 1 2 2 2 2 1 1 2 2 2 2 1 1 1 2 2 2 1 2 2 1 2 1 1 1 1 2 1 1 1 1 2 2
 [676] 1 2 2 2 2 1 2 1 2 2 2 1 1 1 2 1 1 1 1 2 1 1 2 1 1 2 1 2 1 1 1 1 2 1 2 1 1 2 1 1 2 2 2 1 1
 [721] 1 2 2 1 2 2 1 1 2 1 2 1 2 1 1 1 1 2 1 2 1 1 1 1 2 1 2 2 1 2 1 1 2 2 1 1 1 1 2 1 2 2 1 2 2
 [766] 2 2 2 1 2 2 2 1 1 1 1 2 1 2 1 1 2 2 2 2 2 2 1 1 2 1 2 2 2 2 1 1 2 2 1 1 1 2 1 2 2 1 2 2 1
 [811] 1 2 1 1 1 1 1 1 1 2 2 2 2 1 2 1 1 2 1 2 1 1 2 2 1 1 1 2 1 1 2 1 2 2 2 1 1 2 2 1 1 2 1 2 2
 [856] 2 1 2 1 1 2 2 2 2 1 2 2 1 1 2 2 1 1 2 2 1 2 2 1 2 1 2 2 1 2 2 1 2 2 1 1 2 1 2 1 2 2 1 1 1
 [901] 1 1 2 2 2 1 1 1 2 1 2 2 1 2 2 2 1 1 2 1 1 2 2 1 1 1 2 2 2 2 1 1 2 2 1 1 2 2 1 2 2 2 2 1 1
 [946] 1 2 2 2 2 2 1 1 1 1 1 2 2 1 2 1 1 1 2 2 1 1 1 1 2 2 2 1 2 2 1 2 1 2 1 1 1 1 1 2 2 2 1 2 1
 [991] 2 1 2 2 1 1 1 1 2 1

Within cluster sum of squares by cluster:
[1] 5022.712 3646.347
 (between_SS / total_SS =  45.9 %)

Available components:

[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss" "betweenss"   
[7] "size"         "iter"         "ifault"      

The Silhouette coefficient

#average for each cluster 
avg_sil <- silhouette(kmeans.result$cluster, dist(numeric_data)) 

#k-means clustering with estimating k and initializations 
fviz_silhouette(avg_sil)
NA

The total within-cluster sum of squares

# Calculate total within-cluster sum of squares
total_withinss <- kmeans.result$tot.withinss
cat("Total Within-Cluster Sum of Squares:", sum(total_withinss), "\n")
Total Within-Cluster Sum of Squares: 8669.059 
true_labels <- c(1, 1, 2, 1, 2, 2, 3, 3, 4, 4)  # Adjust based on your actual true labels

cluster_assignments <- kmeans.result$cluster
 

BCubed recall and precision


# Calculate BCubed precision
precision <- 0
for (i in unique(true_labels)) {
  cluster_indices <- which(true_labels == i)
  precision <- precision + sum((table(cluster_assignments[cluster_indices]) * (table(cluster_assignments[cluster_indices]) - 1)) / sum(table(cluster_assignments[cluster_indices])))
}
precision <- precision / sum(table(cluster_assignments))

# Calculate BCubed recall
recall <- 0
for (j in unique(cluster_assignments)) {
  cluster_indices <- which(cluster_assignments == j)
  recall <- recall + sum((table(true_labels[cluster_indices]) * (table(true_labels[cluster_indices]) - 1)) / sum(table(true_labels[cluster_indices])))
}
recall <- recall / sum(table(true_labels))

cat("BCubed Precision:", precision, "\n")
BCubed Precision: 0.002333333 
cat("BCubed Recall:", recall, "\n")
BCubed Recall: 0.1166667 

K=2

# k-means clustering set a seed for random number generation to make the results reproducible 
set.seed(8953)

# run kmeans clustering to find 2 clusters
kmeans.result <- kmeans(numeric_data, 2)

# visualize clustering
library(factoextra)
fviz_cluster(kmeans.result, data = numeric_data)


# print the clustering result
print(kmeans.result)
K-means clustering with 2 clusters of sizes 531, 469

Cluster means:
  type_school school_accreditation    gender interest residence parent_age parent_salary
1   0.5856874            0.4067797 0.5235405 2.001883 0.3163842   49.67985     0.5219000
2   0.6353945            0.5650320 0.4413646 2.296375 0.7910448   55.07036     0.4471476
  house_area parent_was_in_college
1  0.5351620              0.700565
2  0.5564648              0.315565

Clustering vector:
   [1] 2 2 1 1 2 1 1 2 1 1 2 1 1 2 1 2 1 1 1 1 1 2 1 2 2 1 2 2 1 1 1 2 2 1 1 2 2 1 2 2 1 2 2 1 1
  [46] 1 1 2 2 2 1 2 1 1 1 2 2 1 1 2 1 1 1 1 2 2 1 1 1 2 2 1 1 2 2 2 2 2 1 1 2 1 2 1 2 2 2 1 1 2
  [91] 2 2 2 1 1 1 1 2 1 1 1 2 1 1 1 2 1 2 2 2 1 1 1 1 1 1 1 2 2 2 1 1 1 1 2 2 1 1 1 2 2 1 2 1 1
 [136] 1 2 1 2 2 2 2 1 1 2 1 1 2 1 1 1 1 2 1 2 1 2 1 2 2 2 1 2 2 1 2 2 2 1 1 2 1 2 1 2 1 1 1 1 1
 [181] 2 1 2 1 1 2 2 2 2 2 2 1 1 2 2 2 1 2 2 2 2 2 2 2 1 2 2 1 2 1 1 1 1 2 2 1 2 1 1 1 2 1 2 1 2
 [226] 1 2 1 2 1 2 2 2 1 1 2 2 1 1 2 2 1 1 1 2 2 1 1 1 2 2 1 1 2 1 1 1 2 1 2 2 2 1 1 2 1 2 1 2 1
 [271] 1 1 1 2 1 1 1 2 1 2 1 1 2 1 1 2 2 2 1 1 2 1 2 1 1 2 1 1 1 2 1 2 1 1 1 1 1 2 1 2 1 2 2 2 2
 [316] 2 1 2 2 1 2 2 2 1 1 2 1 2 1 2 2 1 1 2 2 2 1 1 2 1 1 1 1 1 1 1 1 2 1 2 1 1 1 1 2 1 1 2 1 1
 [361] 1 1 1 1 1 1 1 1 1 2 2 2 2 2 1 1 2 1 2 2 1 2 1 2 2 1 2 2 1 2 1 2 1 1 1 2 1 1 1 1 2 2 1 2 1
 [406] 2 2 1 2 1 1 2 1 2 1 2 2 1 2 2 1 1 2 2 2 2 1 2 2 2 1 1 1 2 2 1 2 1 2 1 2 1 2 2 2 1 1 1 1 2
 [451] 1 1 1 2 1 2 1 1 2 1 1 2 2 1 1 1 1 1 2 1 2 1 2 2 1 1 1 1 1 1 2 1 1 1 2 1 2 2 1 1 1 1 1 2 1
 [496] 2 1 2 1 2 2 2 2 1 2 2 2 1 2 2 1 1 1 2 2 2 2 1 1 2 1 2 2 1 1 1 2 1 1 1 1 2 1 1 2 2 1 1 1 1
 [541] 1 1 1 2 1 1 1 1 2 2 1 2 2 2 2 1 1 1 1 1 1 2 2 1 2 1 2 2 1 2 2 2 2 1 1 2 2 1 1 2 1 1 2 1 2
 [586] 2 1 1 1 2 1 2 2 2 1 1 2 2 1 2 2 2 1 2 2 1 2 1 2 1 1 1 1 1 2 2 1 1 2 2 1 1 2 1 2 2 1 1 2 2
 [631] 1 1 2 1 2 2 1 1 1 2 1 2 1 2 2 2 2 1 1 2 2 2 2 1 1 1 2 2 2 1 2 2 1 2 1 1 1 1 2 1 1 1 1 2 2
 [676] 1 2 2 2 2 1 2 1 2 2 2 1 1 1 2 1 1 1 1 2 1 1 2 1 1 2 1 2 1 1 1 1 2 1 2 1 1 2 1 1 2 2 2 1 1
 [721] 1 2 2 1 2 2 1 1 2 1 2 1 2 1 1 1 1 2 1 2 1 1 1 1 2 1 2 2 1 2 1 1 2 2 1 1 1 1 2 1 2 2 1 2 2
 [766] 2 2 2 1 2 2 2 1 1 1 1 2 1 2 1 1 2 2 2 2 2 2 1 1 2 1 2 2 2 2 1 1 2 2 1 1 1 2 1 2 2 1 2 2 1
 [811] 1 2 1 1 1 1 1 1 1 2 2 2 2 1 2 1 1 2 1 2 1 1 2 2 1 1 1 2 1 1 2 1 2 2 2 1 1 2 2 1 1 2 1 2 2
 [856] 2 1 2 1 1 2 2 2 2 1 2 2 1 1 2 2 1 1 2 2 1 2 2 1 2 1 2 2 1 2 2 1 2 2 1 1 2 1 2 1 2 2 1 1 1
 [901] 1 1 2 2 2 1 1 1 2 1 2 2 1 2 2 2 1 1 2 1 1 2 2 1 1 1 2 2 2 2 1 1 2 2 1 1 2 2 1 2 2 2 2 1 1
 [946] 1 2 2 2 2 2 1 1 1 1 1 2 2 1 2 1 1 1 2 2 1 1 1 1 2 2 2 1 2 2 1 2 1 2 1 1 1 1 1 2 2 2 1 2 1
 [991] 2 1 2 2 1 1 1 1 2 1

Within cluster sum of squares by cluster:
[1] 5022.712 3646.347
 (between_SS / total_SS =  45.9 %)

Available components:

[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss" "betweenss"   
[7] "size"         "iter"         "ifault"      

The Silhouette coefficient

#average for each cluster 
avg_sil <- silhouette(kmeans.result$cluster, dist(numeric_data)) 

#k-means clustering with estimating k and initializations 
fviz_silhouette(avg_sil)
NA

The total within-cluster sum of squares

# Calculate total within-cluster sum of squares
total_withinss <- kmeans.result$tot.withinss
cat("Total Within-Cluster Sum of Squares:", sum(total_withinss), "\n")
Total Within-Cluster Sum of Squares: 8669.059 
true_labels <- c(1, 1, 2, 1, 2, 2, 3, 3, 4, 4)  # Adjust based on your actual true labels

cluster_assignments <- kmeans.result$cluster
 

BCubed recall and precision


# Calculate BCubed precision
precision <- 0
for (i in unique(true_labels)) {
  cluster_indices <- which(true_labels == i)
  precision <- precision + sum((table(cluster_assignments[cluster_indices]) * (table(cluster_assignments[cluster_indices]) - 1)) / sum(table(cluster_assignments[cluster_indices])))
}
precision <- precision / sum(table(cluster_assignments))

# Calculate BCubed recall
recall <- 0
for (j in unique(cluster_assignments)) {
  cluster_indices <- which(cluster_assignments == j)
  recall <- recall + sum((table(true_labels[cluster_indices]) * (table(true_labels[cluster_indices]) - 1)) / sum(table(true_labels[cluster_indices])))
}
recall <- recall / sum(table(true_labels))

cat("BCubed Precision:", precision, "\n")
BCubed Precision: 0.002333333 
cat("BCubed Recall:", recall, "\n")
BCubed Recall: 0.1166667 

K=6

# k-means clustering set a seed for random number generation to make the results reproducible 
set.seed(8953)

# run kmeans clustering to find 6 clusters
kmeans.result <- kmeans(numeric_data, 6)

# visualize clustering
library(factoextra)
fviz_cluster(kmeans.result, data = numeric_data)


# print the clustering result
print(kmeans.result)
K-means clustering with 6 clusters of sizes 104, 91, 213, 233, 239, 120

Cluster means:
  type_school school_accreditation    gender  interest residence parent_age parent_salary
1   0.5288462            0.5288462 0.4134615 0.8365385 0.6442308   55.15385     0.4498611
2   0.7362637            0.6813187 0.4725275 2.3846154 1.0000000   58.60440     0.3728205
3   0.6666667            0.4741784 0.5258216 3.5399061 0.4835681   50.52113     0.5166093
4   0.6952790            0.5536481 0.4420601 3.2703863 0.8583691   54.01717     0.4693371
5   0.5271967            0.3849372 0.4937238 0.5020921 0.2677824   51.26778     0.5096885
6   0.4750000            0.3500000 0.5500000 1.6666667 0.1166667   46.15833     0.5410000
  house_area parent_was_in_college
1  0.5810192             0.4326923
2  0.5303736             0.0989011
3  0.5257512             0.5446009
4  0.5510987             0.2746781
5  0.5787364             0.7740586
6  0.4812833             0.8416667

Clustering vector:
   [1] 1 2 3 3 2 6 3 4 5 6 1 6 6 4 3 4 3 3 6 6 3 4 3 2 4 5 4 5 5 5 3 4 2 5 3 4 1 3 4 5 5 4 4 3 3
  [46] 5 5 4 1 5 6 1 5 3 5 4 1 6 6 4 5 3 5 3 4 4 5 3 5 4 4 5 6 2 1 4 1 4 5 3 4 5 2 5 4 4 1 5 3 4
  [91] 1 4 2 5 6 3 6 4 3 5 5 4 5 3 3 5 3 4 2 2 5 5 5 3 3 6 5 4 2 1 3 6 3 6 1 4 6 3 5 4 4 6 5 3 5
 [136] 5 4 3 4 4 4 2 6 6 5 3 5 1 6 3 6 6 2 5 1 3 1 5 1 4 1 3 4 4 5 5 1 2 3 3 1 5 2 3 4 3 3 5 3 5
 [181] 4 3 1 5 6 1 1 4 4 4 4 6 5 4 1 4 6 5 1 4 1 2 4 4 3 1 2 6 4 3 6 5 6 4 1 3 4 3 5 3 2 3 2 5 5
 [226] 5 5 6 4 5 1 5 5 5 5 2 1 5 3 2 1 3 5 5 4 4 6 5 5 1 2 3 5 4 5 6 3 4 6 4 2 4 6 3 4 5 2 5 4 5
 [271] 6 5 6 4 5 6 5 4 3 1 5 3 2 3 3 4 1 4 5 3 2 5 2 3 3 1 5 3 6 5 3 4 6 5 5 5 6 5 6 1 3 4 5 4 4
 [316] 4 3 1 1 6 2 2 4 6 5 4 6 4 6 4 4 3 5 2 4 1 6 5 5 5 5 5 3 3 5 6 5 4 3 4 3 5 5 6 1 6 3 2 5 6
 [361] 3 6 5 5 5 6 3 3 6 1 2 2 2 4 3 3 1 5 4 4 3 4 3 1 4 3 4 2 3 1 5 2 6 3 3 2 3 5 3 6 4 4 5 4 6
 [406] 4 4 5 4 3 3 4 5 2 3 1 5 3 5 4 3 6 4 1 2 2 6 1 1 4 6 3 6 2 4 6 4 6 4 3 5 5 2 2 4 6 5 6 5 4
 [451] 3 3 5 4 3 4 3 3 4 5 5 2 2 3 3 5 3 3 4 3 4 6 4 4 5 3 5 6 5 6 1 6 3 5 2 3 4 4 3 5 6 5 3 4 3
 [496] 4 6 2 6 4 4 4 4 5 1 4 4 6 2 4 6 3 5 2 1 4 1 3 6 1 5 2 1 3 5 5 4 5 6 6 6 4 3 3 2 4 3 3 6 3
 [541] 3 5 3 4 3 3 5 5 1 1 5 2 4 4 5 6 3 6 5 3 6 4 4 6 5 3 4 2 3 4 2 4 4 5 3 4 4 5 5 4 5 5 4 6 5
 [586] 4 3 5 5 4 6 1 1 4 5 6 1 4 3 4 4 1 3 4 1 3 4 5 4 3 3 5 5 6 2 1 3 5 2 5 3 5 1 6 1 2 3 3 2 4
 [631] 5 5 2 5 4 4 5 3 5 2 3 2 5 4 5 1 4 5 3 1 1 4 2 3 3 6 4 5 4 5 2 4 3 2 3 3 6 3 4 5 3 5 5 4 2
 [676] 3 1 2 1 4 5 4 6 4 1 2 3 3 3 4 5 3 6 3 5 3 5 5 5 3 4 3 5 6 5 5 3 4 5 4 3 3 2 6 3 4 1 2 5 3
 [721] 3 2 4 3 1 2 6 6 4 3 1 3 4 3 6 6 3 4 3 5 6 5 6 5 4 3 1 4 6 5 5 5 5 1 3 5 3 3 4 5 1 1 3 4 4
 [766] 5 2 2 5 4 4 4 5 5 3 3 5 6 2 5 5 4 5 5 4 1 4 5 3 5 5 1 1 1 5 3 6 4 4 3 3 5 4 5 4 4 5 4 1 5
 [811] 6 2 6 3 6 3 6 5 5 4 4 4 4 5 2 3 5 1 3 2 3 5 1 4 6 5 5 1 5 6 4 5 1 1 1 3 3 1 4 5 3 4 6 4 2
 [856] 4 6 4 3 3 4 4 4 2 5 4 2 5 3 1 4 3 6 4 4 3 4 1 3 4 5 2 1 3 2 1 5 4 4 6 6 4 5 4 6 1 4 3 5 5
 [901] 3 5 4 2 4 6 3 5 2 5 4 1 3 5 2 5 5 5 4 3 3 1 4 5 5 5 4 4 4 1 6 3 1 4 6 5 4 2 3 1 1 1 4 3 5
 [946] 3 1 4 4 4 2 3 3 3 5 5 1 1 5 1 5 3 6 4 2 3 5 5 5 5 4 2 5 4 4 6 4 3 2 5 5 3 3 6 2 4 2 6 4 3
 [991] 4 3 4 4 5 3 5 3 5 3

Within cluster sum of squares by cluster:
[1] 285.6366 426.7825 722.6048 655.5483 782.3111 828.9997
 (between_SS / total_SS =  76.9 %)

Available components:

[1] "cluster"      "centers"      "totss"        "withinss"     "tot.withinss" "betweenss"   
[7] "size"         "iter"         "ifault"      
library(cluster) 
#average for each cluster 
avg_sil <- silhouette(kmeans.result$cluster,dist(numeric_data)) 

#k-means clustering with estimating k and initializations 
fviz_silhouette(avg_sil)

The optimal number of cluster

To find the optimal number of clusters to use in the k-means algorithm, we’ll use the fviz_nbclust() function from the factoextrapackage to create a plot of the number of clusters vs. the total within sum of squares

# Function to calculate total within-cluster sum of squares (wss)
wss <- function(k) {
  kmeans_result <- kmeans(numeric_data, centers = k, nstart = 10)  # You can adjust nstart based on your preference
  return(sum(kmeans_result$tot.withinss))
}

# Calculate the total within-cluster sum of squares for different values of k
k_values <- 1:10  # You can adjust the range of k values
wss_values <- sapply(k_values, wss)

# Plot the elbow curve
plot(k_values, wss_values, type = "b", pch = 19, frame = FALSE, 
     xlab = "Number of Clusters (k)", ylab = "Total Within-Cluster Sum of Squares (WSS)",
     main = "Elbow Method")

# Adding a line to indicate the "elbow"
abline(v = which(diff(wss_values) == max(diff(wss_values))) + 1, col = "red")

NA
NA
gap_stat <- clusGap(numeric_data,
                    FUN = kmeans,
                    nstart = 25,
                    K.max = 10,
                    B = 50)
Clustering k = 1,2,..., K.max (= 10): .. done
Bootstrapping, b = 1,2,..., B (= 50)  [one "." per sample]:
..........................
Warning: did not converge in 10 iterations
........................ 50 
#plot number of clusters vs. gap statistic
fviz_gap_stat(gap_stat)

Classification:

factor the data


data <- Preprocessed_dataset

data$will_go_to_college <- factor(data$will_go_to_college, levels = c("1", "0"), labels = c("1", "0"))
data$residence <- factor(data$residence, levels = c("1", "0"), labels = c("1", "0"))
data$gender <- factor(data$gender, levels = c("1", "0"), labels = c("1", "0"))
data$parent_was_in_college <- factor(data$parent_was_in_college, levels = c("1", "0"), labels = c("1", "0"))
data$interest <- factor(data$interest, levels = c("4","3","2","1", "0"), labels = c("4","3","2","1", "0"))
data$type_school <- factor(data$type_school, levels = c("1", "0"), labels = c("1", "0"))
data$school_accreditation <- factor(data$school_accreditation, levels = c("1", "0"), labels = c("1", "0"))

data$average_grades <- factor(data$average_grades, levels = c("+A", "A","+B","B","+C","C","+D","D","F"), labels = c("+A", "A","+B","B","+C","C","+D","D","F"))
str(data)
'data.frame':   1000 obs. of  11 variables:
 $ type_school          : Factor w/ 2 levels "1","0": 1 1 1 2 1 2 1 1 1 1 ...
 $ school_accreditation : Factor w/ 2 levels "1","0": 1 1 2 2 1 2 1 2 2 2 ...
 $ gender               : Factor w/ 2 levels "1","0": 2 2 1 2 1 1 2 2 1 1 ...
 $ interest             : Factor w/ 5 levels "4","3","2","1",..: 3 3 1 1 1 3 1 1 5 1 ...
 $ residence            : Factor w/ 2 levels "1","0": 1 1 1 2 1 2 2 2 2 2 ...
 $ parent_age           : int  56 57 50 49 57 48 52 53 52 47 ...
 $ parent_salary        : num  0.661 0.379 0.611 0.622 0.472 ...
 $ house_area           : num  0.63 0.568 0.606 0.582 0.551 0.453 0.655 0.633 0.603 0.48 ...
 $ average_grades       : Factor w/ 9 levels "+A","A","+B",..: 4 3 3 4 3 3 2 4 3 3 ...
 $ parent_was_in_college: Factor w/ 2 levels "1","0": 2 2 2 1 2 1 1 1 1 1 ...
 $ will_go_to_college   : Factor w/ 2 levels "1","0": 2 2 2 2 1 1 2 1 2 1 ...

balanced or imbalanced


library(tidyverse) 
library(caret)
data$will_go_to_college<- as.numeric(data$will_go_to_college)
hist(data$will_go_to_college,col="coral")

prop.table(table(data$will_go_to_college))

  1   2 
0.5 0.5 

we want to confirm that the distribution between the two label data is not too much different. Because imbalanced datasets can lead to imbalanced accuracy.

Fortunately ,our data is balanced

partition method

We opted for cross-validation as our partition method owing to the constraints posed by limited data availability. To ensure robustness in our evaluation, we employed three distinct values for k folds 2, 3, and 4. we chose small k folds because of our small data

c4.5

library(caret)
library(rpart)
library(dplyr)
library(rpart.plot)


set.seed(123)
fold <- c(2, 3, 4)  # Values of xval (number of folds) to try

for (fold in fold) {
  cat("fold =", fold, "\n")
  trctrl <- trainControl(method = "cv", number = fold, savePredictions = TRUE)
  c45_fit <- train(will_go_to_college ~ ., data = data, method = "J48", trControl = trctrl)
   print(c45_fit$finalModel)
  # Get predicted values
  predictions <- predict(c45_fit, newdata = data)
  
  # Create confusion matrix
  confusion_matrix <- confusionMatrix(predictions, data$will_go_to_college)
  
  # Print the confusion matrix
  print(confusion_matrix)
  
  # Print accuracy for each fold
  pred <- c45_fit$pred
  pred$equal <- ifelse(pred$pred == pred$obs, 1, 0)
  eachfold <- pred %>%
    group_by(Resample) %>%
    summarise_at(vars(equal),
                 list(Accuracy = mean))
  print(eachfold)
    # Plot decision tree
plot(c45_fit$finalModel, main = fold)
  
}
fold = 2 
J48 pruned tree
------------------

average_gradesA <= 0
|   parent_salary <= 0.416667
|   |   house_area <= 0.559
|   |   |   interest0 <= 0: 1 (187.0/3.0)
|   |   |   interest0 > 0
|   |   |   |   house_area <= 0.383
|   |   |   |   |   residence0 <= 0
|   |   |   |   |   |   parent_age <= 58: 0 (3.0)
|   |   |   |   |   |   parent_age > 58: 1 (1.0)
|   |   |   |   |   residence0 > 0: 1 (1.0)
|   |   |   |   house_area > 0.383: 1 (15.0)
|   |   house_area > 0.559
|   |   |   parent_was_in_college0 <= 0
|   |   |   |   type_school0 <= 0
|   |   |   |   |   average_grades+B <= 0
|   |   |   |   |   |   average_gradesB <= 0
|   |   |   |   |   |   |   interest2 <= 0: 0 (5.0)
|   |   |   |   |   |   |   interest2 > 0: 1 (1.0)
|   |   |   |   |   |   average_gradesB > 0: 1 (9.0)
|   |   |   |   |   average_grades+B > 0: 1 (12.0)
|   |   |   |   type_school0 > 0: 1 (28.0)
|   |   |   parent_was_in_college0 > 0
|   |   |   |   parent_salary <= 0.368889
|   |   |   |   |   average_grades+C <= 0
|   |   |   |   |   |   parent_age <= 60: 1 (19.0/2.0)
|   |   |   |   |   |   parent_age > 60: 0 (2.0)
|   |   |   |   |   average_grades+C > 0: 0 (5.0)
|   |   |   |   parent_salary > 0.368889: 0 (20.0/1.0)
|   parent_salary > 0.416667
|   |   house_area <= 0.413
|   |   |   interest3 <= 0
|   |   |   |   type_school0 <= 0
|   |   |   |   |   school_accreditation0 <= 0: 1 (26.0)
|   |   |   |   |   school_accreditation0 > 0
|   |   |   |   |   |   parent_salary <= 0.524444: 1 (6.0)
|   |   |   |   |   |   parent_salary > 0.524444: 0 (2.0)
|   |   |   |   type_school0 > 0
|   |   |   |   |   gender0 <= 0
|   |   |   |   |   |   interest0 <= 0: 0 (6.0)
|   |   |   |   |   |   interest0 > 0
|   |   |   |   |   |   |   house_area <= 0.348: 0 (3.0)
|   |   |   |   |   |   |   house_area > 0.348: 1 (2.0)
|   |   |   |   |   gender0 > 0
|   |   |   |   |   |   school_accreditation0 <= 0
|   |   |   |   |   |   |   parent_was_in_college0 <= 0
|   |   |   |   |   |   |   |   parent_age <= 51: 0 (3.0)
|   |   |   |   |   |   |   |   parent_age > 51: 1 (1.0)
|   |   |   |   |   |   |   parent_was_in_college0 > 0
|   |   |   |   |   |   |   |   parent_age <= 53: 1 (9.0)
|   |   |   |   |   |   |   |   parent_age > 53: 0 (2.0)
|   |   |   |   |   |   school_accreditation0 > 0: 1 (3.0)
|   |   |   interest3 > 0: 1 (28.0)
|   |   house_area > 0.413
|   |   |   average_gradesB <= 0
|   |   |   |   type_school0 <= 0
|   |   |   |   |   school_accreditation0 <= 0
|   |   |   |   |   |   residence0 <= 0
|   |   |   |   |   |   |   parent_salary <= 0.503333
|   |   |   |   |   |   |   |   interest2 <= 0
|   |   |   |   |   |   |   |   |   house_area <= 0.665
|   |   |   |   |   |   |   |   |   |   parent_age <= 53
|   |   |   |   |   |   |   |   |   |   |   parent_salary <= 0.496667: 0 (4.0)
|   |   |   |   |   |   |   |   |   |   |   parent_salary > 0.496667: 1 (1.0)
|   |   |   |   |   |   |   |   |   |   parent_age > 53: 1 (5.0)
|   |   |   |   |   |   |   |   |   house_area > 0.665: 0 (4.0)
|   |   |   |   |   |   |   |   interest2 > 0: 0 (5.0)
|   |   |   |   |   |   |   parent_salary > 0.503333: 0 (41.0/1.0)
|   |   |   |   |   |   residence0 > 0
|   |   |   |   |   |   |   interest2 <= 0
|   |   |   |   |   |   |   |   interest0 <= 0
|   |   |   |   |   |   |   |   |   gender0 <= 0: 0 (1.0)
|   |   |   |   |   |   |   |   |   gender0 > 0
|   |   |   |   |   |   |   |   |   |   interest1 <= 0
|   |   |   |   |   |   |   |   |   |   |   parent_age <= 54: 1 (3.0)
|   |   |   |   |   |   |   |   |   |   |   parent_age > 54: 0 (1.0)
|   |   |   |   |   |   |   |   |   |   interest1 > 0: 0 (1.0)
|   |   |   |   |   |   |   |   interest0 > 0: 0 (9.0)
|   |   |   |   |   |   |   interest2 > 0: 1 (2.0)
|   |   |   |   |   school_accreditation0 > 0: 0 (95.0/2.0)
|   |   |   |   type_school0 > 0
|   |   |   |   |   interest2 <= 0
|   |   |   |   |   |   interest3 <= 0
|   |   |   |   |   |   |   residence0 <= 0
|   |   |   |   |   |   |   |   interest0 <= 0: 0 (14.0)
|   |   |   |   |   |   |   |   interest0 > 0
|   |   |   |   |   |   |   |   |   house_area <= 0.462: 0 (3.0)
|   |   |   |   |   |   |   |   |   house_area > 0.462
|   |   |   |   |   |   |   |   |   |   parent_age <= 52: 0 (2.0)
|   |   |   |   |   |   |   |   |   |   parent_age > 52: 1 (4.0/1.0)
|   |   |   |   |   |   |   residence0 > 0
|   |   |   |   |   |   |   |   school_accreditation0 <= 0
|   |   |   |   |   |   |   |   |   parent_salary <= 0.495556: 1 (7.0)
|   |   |   |   |   |   |   |   |   parent_salary > 0.495556
|   |   |   |   |   |   |   |   |   |   parent_age <= 48: 1 (3.0)
|   |   |   |   |   |   |   |   |   |   parent_age > 48
|   |   |   |   |   |   |   |   |   |   |   gender0 <= 0: 0 (15.0)
|   |   |   |   |   |   |   |   |   |   |   gender0 > 0
|   |   |   |   |   |   |   |   |   |   |   |   house_area <= 0.618
|   |   |   |   |   |   |   |   |   |   |   |   |   parent_was_in_college0 <= 0
|   |   |   |   |   |   |   |   |   |   |   |   |   |   interest1 <= 0: 0 (4.0/1.0)
|   |   |   |   |   |   |   |   |   |   |   |   |   |   interest1 > 0: 1 (1.0)
|   |   |   |   |   |   |   |   |   |   |   |   |   parent_was_in_college0 > 0
|   |   |   |   |   |   |   |   |   |   |   |   |   |   interest1 <= 0: 1 (4.0)
|   |   |   |   |   |   |   |   |   |   |   |   |   |   interest1 > 0: 0 (1.0)
|   |   |   |   |   |   |   |   |   |   |   |   house_area > 0.618: 0 (9.0)
|   |   |   |   |   |   |   |   school_accreditation0 > 0
|   |   |   |   |   |   |   |   |   interest0 <= 0
|   |   |   |   |   |   |   |   |   |   parent_salary <= 0.587778
|   |   |   |   |   |   |   |   |   |   |   average_grades+B <= 0: 0 (1.0)
|   |   |   |   |   |   |   |   |   |   |   average_grades+B > 0
|   |   |   |   |   |   |   |   |   |   |   |   house_area <= 0.47: 0 (1.0)
|   |   |   |   |   |   |   |   |   |   |   |   house_area > 0.47: 1 (11.0)
|   |   |   |   |   |   |   |   |   |   parent_salary > 0.587778: 0 (8.0/2.0)
|   |   |   |   |   |   |   |   |   interest0 > 0
|   |   |   |   |   |   |   |   |   |   parent_salary <= 0.676667: 1 (24.0/2.0)
|   |   |   |   |   |   |   |   |   |   parent_salary > 0.676667
|   |   |   |   |   |   |   |   |   |   |   house_area <= 0.5: 1 (1.0)
|   |   |   |   |   |   |   |   |   |   |   house_area > 0.5: 0 (3.0)
|   |   |   |   |   |   interest3 > 0: 0 (7.0)
|   |   |   |   |   interest2 > 0: 0 (24.0)
|   |   |   average_gradesB > 0
|   |   |   |   residence0 <= 0
|   |   |   |   |   parent_was_in_college0 <= 0: 0 (9.0)
|   |   |   |   |   parent_was_in_college0 > 0
|   |   |   |   |   |   interest1 <= 0
|   |   |   |   |   |   |   school_accreditation0 <= 0
|   |   |   |   |   |   |   |   parent_age <= 57
|   |   |   |   |   |   |   |   |   interest3 <= 0
|   |   |   |   |   |   |   |   |   |   house_area <= 0.486
|   |   |   |   |   |   |   |   |   |   |   type_school0 <= 0
|   |   |   |   |   |   |   |   |   |   |   |   parent_salary <= 0.63: 1 (3.0)
|   |   |   |   |   |   |   |   |   |   |   |   parent_salary > 0.63: 0 (3.0)
|   |   |   |   |   |   |   |   |   |   |   type_school0 > 0
|   |   |   |   |   |   |   |   |   |   |   |   parent_age <= 54
|   |   |   |   |   |   |   |   |   |   |   |   |   parent_salary <= 0.458889: 0 (1.0)
|   |   |   |   |   |   |   |   |   |   |   |   |   parent_salary > 0.458889: 1 (3.0)
|   |   |   |   |   |   |   |   |   |   |   |   parent_age > 54: 0 (2.0)
|   |   |   |   |   |   |   |   |   |   house_area > 0.486: 0 (28.0/3.0)
|   |   |   |   |   |   |   |   |   interest3 > 0
|   |   |   |   |   |   |   |   |   |   house_area <= 0.675: 1 (3.0)
|   |   |   |   |   |   |   |   |   |   house_area > 0.675: 0 (2.0)
|   |   |   |   |   |   |   |   parent_age > 57: 1 (3.0)
|   |   |   |   |   |   |   school_accreditation0 > 0
|   |   |   |   |   |   |   |   parent_salary <= 0.451111: 0 (3.0)
|   |   |   |   |   |   |   |   parent_salary > 0.451111
|   |   |   |   |   |   |   |   |   house_area <= 0.503: 0 (1.0)
|   |   |   |   |   |   |   |   |   house_area > 0.503: 1 (9.0)
|   |   |   |   |   |   interest1 > 0: 0 (6.0)
|   |   |   |   residence0 > 0
|   |   |   |   |   house_area <= 0.635
|   |   |   |   |   |   parent_was_in_college0 <= 0
|   |   |   |   |   |   |   parent_salary <= 0.586667: 1 (22.0/2.0)
|   |   |   |   |   |   |   parent_salary > 0.586667
|   |   |   |   |   |   |   |   parent_age <= 50
|   |   |   |   |   |   |   |   |   type_school0 <= 0: 1 (2.0)
|   |   |   |   |   |   |   |   |   type_school0 > 0
|   |   |   |   |   |   |   |   |   |   house_area <= 0.569: 1 (8.0)
|   |   |   |   |   |   |   |   |   |   house_area > 0.569: 0 (2.0)
|   |   |   |   |   |   |   |   parent_age > 50: 0 (2.0)
|   |   |   |   |   |   parent_was_in_college0 > 0: 1 (23.0)
|   |   |   |   |   house_area > 0.635
|   |   |   |   |   |   parent_age <= 46: 1 (4.0)
|   |   |   |   |   |   parent_age > 46
|   |   |   |   |   |   |   interest2 <= 0: 0 (17.0/2.0)
|   |   |   |   |   |   |   interest2 > 0
|   |   |   |   |   |   |   |   type_school0 <= 0: 1 (2.0)
|   |   |   |   |   |   |   |   type_school0 > 0: 0 (1.0)
average_gradesA > 0: 0 (123.0/2.0)

Number of Leaves  :     85

Size of the tree :  169

Confusion Matrix and Statistics

          Reference
Prediction   1   0
         1 486  10
         0  14 490
                                          
               Accuracy : 0.976           
                 95% CI : (0.9645, 0.9846)
    No Information Rate : 0.5             
    P-Value [Acc > NIR] : <2e-16          
                                          
                  Kappa : 0.952           
                                          
 Mcnemar's Test P-Value : 0.5403          
                                          
            Sensitivity : 0.9720          
            Specificity : 0.9800          
         Pos Pred Value : 0.9798          
         Neg Pred Value : 0.9722          
             Prevalence : 0.5000          
         Detection Rate : 0.4860          
   Detection Prevalence : 0.4960          
      Balanced Accuracy : 0.9760          
                                          
       'Positive' Class : 1               
                                          
fold = 3 
J48 pruned tree
------------------

average_gradesA <= 0
|   parent_salary <= 0.416667
|   |   house_area <= 0.559
|   |   |   interest0 <= 0: 1 (187.0/3.0)
|   |   |   interest0 > 0
|   |   |   |   house_area <= 0.383
|   |   |   |   |   residence0 <= 0
|   |   |   |   |   |   parent_age <= 58: 0 (3.0)
|   |   |   |   |   |   parent_age > 58: 1 (1.0)
|   |   |   |   |   residence0 > 0: 1 (1.0)
|   |   |   |   house_area > 0.383: 1 (15.0)
|   |   house_area > 0.559
|   |   |   parent_was_in_college0 <= 0
|   |   |   |   type_school0 <= 0
|   |   |   |   |   average_grades+B <= 0
|   |   |   |   |   |   average_gradesB <= 0
|   |   |   |   |   |   |   interest2 <= 0: 0 (5.0)
|   |   |   |   |   |   |   interest2 > 0: 1 (1.0)
|   |   |   |   |   |   average_gradesB > 0: 1 (9.0)
|   |   |   |   |   average_grades+B > 0: 1 (12.0)
|   |   |   |   type_school0 > 0: 1 (28.0)
|   |   |   parent_was_in_college0 > 0
|   |   |   |   parent_salary <= 0.368889
|   |   |   |   |   average_grades+C <= 0
|   |   |   |   |   |   parent_age <= 60: 1 (19.0/2.0)
|   |   |   |   |   |   parent_age > 60: 0 (2.0)
|   |   |   |   |   average_grades+C > 0: 0 (5.0)
|   |   |   |   parent_salary > 0.368889: 0 (20.0/1.0)
|   parent_salary > 0.416667
|   |   house_area <= 0.413
|   |   |   interest3 <= 0
|   |   |   |   type_school0 <= 0
|   |   |   |   |   school_accreditation0 <= 0: 1 (26.0)
|   |   |   |   |   school_accreditation0 > 0
|   |   |   |   |   |   parent_salary <= 0.524444: 1 (6.0)
|   |   |   |   |   |   parent_salary > 0.524444: 0 (2.0)
|   |   |   |   type_school0 > 0
|   |   |   |   |   gender0 <= 0
|   |   |   |   |   |   interest0 <= 0: 0 (6.0)
|   |   |   |   |   |   interest0 > 0
|   |   |   |   |   |   |   house_area <= 0.348: 0 (3.0)
|   |   |   |   |   |   |   house_area > 0.348: 1 (2.0)
|   |   |   |   |   gender0 > 0
|   |   |   |   |   |   school_accreditation0 <= 0
|   |   |   |   |   |   |   parent_was_in_college0 <= 0
|   |   |   |   |   |   |   |   parent_age <= 51: 0 (3.0)
|   |   |   |   |   |   |   |   parent_age > 51: 1 (1.0)
|   |   |   |   |   |   |   parent_was_in_college0 > 0
|   |   |   |   |   |   |   |   parent_age <= 53: 1 (9.0)
|   |   |   |   |   |   |   |   parent_age > 53: 0 (2.0)
|   |   |   |   |   |   school_accreditation0 > 0: 1 (3.0)
|   |   |   interest3 > 0: 1 (28.0)
|   |   house_area > 0.413
|   |   |   average_gradesB <= 0
|   |   |   |   type_school0 <= 0
|   |   |   |   |   school_accreditation0 <= 0
|   |   |   |   |   |   residence0 <= 0
|   |   |   |   |   |   |   parent_salary <= 0.503333
|   |   |   |   |   |   |   |   interest2 <= 0
|   |   |   |   |   |   |   |   |   house_area <= 0.665
|   |   |   |   |   |   |   |   |   |   parent_age <= 53
|   |   |   |   |   |   |   |   |   |   |   parent_salary <= 0.496667: 0 (4.0)
|   |   |   |   |   |   |   |   |   |   |   parent_salary > 0.496667: 1 (1.0)
|   |   |   |   |   |   |   |   |   |   parent_age > 53: 1 (5.0)
|   |   |   |   |   |   |   |   |   house_area > 0.665: 0 (4.0)
|   |   |   |   |   |   |   |   interest2 > 0: 0 (5.0)
|   |   |   |   |   |   |   parent_salary > 0.503333: 0 (41.0/1.0)
|   |   |   |   |   |   residence0 > 0
|   |   |   |   |   |   |   interest2 <= 0
|   |   |   |   |   |   |   |   interest0 <= 0
|   |   |   |   |   |   |   |   |   gender0 <= 0: 0 (1.0)
|   |   |   |   |   |   |   |   |   gender0 > 0
|   |   |   |   |   |   |   |   |   |   interest1 <= 0
|   |   |   |   |   |   |   |   |   |   |   parent_age <= 54: 1 (3.0)
|   |   |   |   |   |   |   |   |   |   |   parent_age > 54: 0 (1.0)
|   |   |   |   |   |   |   |   |   |   interest1 > 0: 0 (1.0)
|   |   |   |   |   |   |   |   interest0 > 0: 0 (9.0)
|   |   |   |   |   |   |   interest2 > 0: 1 (2.0)
|   |   |   |   |   school_accreditation0 > 0: 0 (95.0/2.0)
|   |   |   |   type_school0 > 0
|   |   |   |   |   interest2 <= 0
|   |   |   |   |   |   interest3 <= 0
|   |   |   |   |   |   |   residence0 <= 0
|   |   |   |   |   |   |   |   interest0 <= 0: 0 (14.0)
|   |   |   |   |   |   |   |   interest0 > 0
|   |   |   |   |   |   |   |   |   house_area <= 0.462: 0 (3.0)
|   |   |   |   |   |   |   |   |   house_area > 0.462
|   |   |   |   |   |   |   |   |   |   parent_age <= 52: 0 (2.0)
|   |   |   |   |   |   |   |   |   |   parent_age > 52: 1 (4.0/1.0)
|   |   |   |   |   |   |   residence0 > 0
|   |   |   |   |   |   |   |   school_accreditation0 <= 0
|   |   |   |   |   |   |   |   |   parent_salary <= 0.495556: 1 (7.0)
|   |   |   |   |   |   |   |   |   parent_salary > 0.495556
|   |   |   |   |   |   |   |   |   |   parent_age <= 48: 1 (3.0)
|   |   |   |   |   |   |   |   |   |   parent_age > 48
|   |   |   |   |   |   |   |   |   |   |   gender0 <= 0: 0 (15.0)
|   |   |   |   |   |   |   |   |   |   |   gender0 > 0
|   |   |   |   |   |   |   |   |   |   |   |   house_area <= 0.618
|   |   |   |   |   |   |   |   |   |   |   |   |   parent_was_in_college0 <= 0
|   |   |   |   |   |   |   |   |   |   |   |   |   |   interest1 <= 0: 0 (4.0/1.0)
|   |   |   |   |   |   |   |   |   |   |   |   |   |   interest1 > 0: 1 (1.0)
|   |   |   |   |   |   |   |   |   |   |   |   |   parent_was_in_college0 > 0
|   |   |   |   |   |   |   |   |   |   |   |   |   |   interest1 <= 0: 1 (4.0)
|   |   |   |   |   |   |   |   |   |   |   |   |   |   interest1 > 0: 0 (1.0)
|   |   |   |   |   |   |   |   |   |   |   |   house_area > 0.618: 0 (9.0)
|   |   |   |   |   |   |   |   school_accreditation0 > 0
|   |   |   |   |   |   |   |   |   interest0 <= 0
|   |   |   |   |   |   |   |   |   |   parent_salary <= 0.587778
|   |   |   |   |   |   |   |   |   |   |   average_grades+B <= 0: 0 (1.0)
|   |   |   |   |   |   |   |   |   |   |   average_grades+B > 0
|   |   |   |   |   |   |   |   |   |   |   |   house_area <= 0.47: 0 (1.0)
|   |   |   |   |   |   |   |   |   |   |   |   house_area > 0.47: 1 (11.0)
|   |   |   |   |   |   |   |   |   |   parent_salary > 0.587778: 0 (8.0/2.0)
|   |   |   |   |   |   |   |   |   interest0 > 0
|   |   |   |   |   |   |   |   |   |   parent_salary <= 0.676667: 1 (24.0/2.0)
|   |   |   |   |   |   |   |   |   |   parent_salary > 0.676667
|   |   |   |   |   |   |   |   |   |   |   house_area <= 0.5: 1 (1.0)
|   |   |   |   |   |   |   |   |   |   |   house_area > 0.5: 0 (3.0)
|   |   |   |   |   |   interest3 > 0: 0 (7.0)
|   |   |   |   |   interest2 > 0: 0 (24.0)
|   |   |   average_gradesB > 0
|   |   |   |   residence0 <= 0
|   |   |   |   |   parent_was_in_college0 <= 0: 0 (9.0)
|   |   |   |   |   parent_was_in_college0 > 0
|   |   |   |   |   |   interest1 <= 0
|   |   |   |   |   |   |   school_accreditation0 <= 0
|   |   |   |   |   |   |   |   parent_age <= 57
|   |   |   |   |   |   |   |   |   interest3 <= 0
|   |   |   |   |   |   |   |   |   |   house_area <= 0.486
|   |   |   |   |   |   |   |   |   |   |   type_school0 <= 0
|   |   |   |   |   |   |   |   |   |   |   |   parent_salary <= 0.63: 1 (3.0)
|   |   |   |   |   |   |   |   |   |   |   |   parent_salary > 0.63: 0 (3.0)
|   |   |   |   |   |   |   |   |   |   |   type_school0 > 0
|   |   |   |   |   |   |   |   |   |   |   |   parent_age <= 54
|   |   |   |   |   |   |   |   |   |   |   |   |   parent_salary <= 0.458889: 0 (1.0)
|   |   |   |   |   |   |   |   |   |   |   |   |   parent_salary > 0.458889: 1 (3.0)
|   |   |   |   |   |   |   |   |   |   |   |   parent_age > 54: 0 (2.0)
|   |   |   |   |   |   |   |   |   |   house_area > 0.486: 0 (28.0/3.0)
|   |   |   |   |   |   |   |   |   interest3 > 0
|   |   |   |   |   |   |   |   |   |   house_area <= 0.675: 1 (3.0)
|   |   |   |   |   |   |   |   |   |   house_area > 0.675: 0 (2.0)
|   |   |   |   |   |   |   |   parent_age > 57: 1 (3.0)
|   |   |   |   |   |   |   school_accreditation0 > 0
|   |   |   |   |   |   |   |   parent_salary <= 0.451111: 0 (3.0)
|   |   |   |   |   |   |   |   parent_salary > 0.451111
|   |   |   |   |   |   |   |   |   house_area <= 0.503: 0 (1.0)
|   |   |   |   |   |   |   |   |   house_area > 0.503: 1 (9.0)
|   |   |   |   |   |   interest1 > 0: 0 (6.0)
|   |   |   |   residence0 > 0
|   |   |   |   |   house_area <= 0.635
|   |   |   |   |   |   parent_was_in_college0 <= 0
|   |   |   |   |   |   |   parent_salary <= 0.586667: 1 (22.0/2.0)
|   |   |   |   |   |   |   parent_salary > 0.586667
|   |   |   |   |   |   |   |   parent_age <= 50
|   |   |   |   |   |   |   |   |   type_school0 <= 0: 1 (2.0)
|   |   |   |   |   |   |   |   |   type_school0 > 0
|   |   |   |   |   |   |   |   |   |   house_area <= 0.569: 1 (8.0)
|   |   |   |   |   |   |   |   |   |   house_area > 0.569: 0 (2.0)
|   |   |   |   |   |   |   |   parent_age > 50: 0 (2.0)
|   |   |   |   |   |   parent_was_in_college0 > 0: 1 (23.0)
|   |   |   |   |   house_area > 0.635
|   |   |   |   |   |   parent_age <= 46: 1 (4.0)
|   |   |   |   |   |   parent_age > 46
|   |   |   |   |   |   |   interest2 <= 0: 0 (17.0/2.0)
|   |   |   |   |   |   |   interest2 > 0
|   |   |   |   |   |   |   |   type_school0 <= 0: 1 (2.0)
|   |   |   |   |   |   |   |   type_school0 > 0: 0 (1.0)
average_gradesA > 0: 0 (123.0/2.0)

Number of Leaves  :     85

Size of the tree :  169

Confusion Matrix and Statistics

          Reference
Prediction   1   0
         1 486  10
         0  14 490
                                          
               Accuracy : 0.976           
                 95% CI : (0.9645, 0.9846)
    No Information Rate : 0.5             
    P-Value [Acc > NIR] : <2e-16          
                                          
                  Kappa : 0.952           
                                          
 Mcnemar's Test P-Value : 0.5403          
                                          
            Sensitivity : 0.9720          
            Specificity : 0.9800          
         Pos Pred Value : 0.9798          
         Neg Pred Value : 0.9722          
             Prevalence : 0.5000          
         Detection Rate : 0.4860          
   Detection Prevalence : 0.4960          
      Balanced Accuracy : 0.9760          
                                          
       'Positive' Class : 1               
                                          
fold = 4 
J48 pruned tree
------------------

average_gradesA <= 0
|   parent_salary <= 0.416667
|   |   house_area <= 0.559: 1 (207.0/6.0)
|   |   house_area > 0.559
|   |   |   parent_was_in_college0 <= 0
|   |   |   |   type_school0 <= 0
|   |   |   |   |   average_grades+B <= 0
|   |   |   |   |   |   average_gradesB <= 0
|   |   |   |   |   |   |   interest2 <= 0: 0 (5.0)
|   |   |   |   |   |   |   interest2 > 0: 1 (1.0)
|   |   |   |   |   |   average_gradesB > 0: 1 (9.0)
|   |   |   |   |   average_grades+B > 0: 1 (12.0)
|   |   |   |   type_school0 > 0: 1 (28.0)
|   |   |   parent_was_in_college0 > 0
|   |   |   |   parent_salary <= 0.368889
|   |   |   |   |   average_grades+C <= 0
|   |   |   |   |   |   parent_age <= 60: 1 (19.0/2.0)
|   |   |   |   |   |   parent_age > 60: 0 (2.0)
|   |   |   |   |   average_grades+C > 0: 0 (5.0)
|   |   |   |   parent_salary > 0.368889: 0 (20.0/1.0)
|   parent_salary > 0.416667
|   |   house_area <= 0.413
|   |   |   interest3 <= 0
|   |   |   |   type_school0 <= 0
|   |   |   |   |   school_accreditation0 <= 0: 1 (26.0)
|   |   |   |   |   school_accreditation0 > 0
|   |   |   |   |   |   parent_salary <= 0.524444: 1 (6.0)
|   |   |   |   |   |   parent_salary > 0.524444: 0 (2.0)
|   |   |   |   type_school0 > 0
|   |   |   |   |   gender0 <= 0
|   |   |   |   |   |   interest0 <= 0: 0 (6.0)
|   |   |   |   |   |   interest0 > 0
|   |   |   |   |   |   |   house_area <= 0.348: 0 (3.0)
|   |   |   |   |   |   |   house_area > 0.348: 1 (2.0)
|   |   |   |   |   gender0 > 0
|   |   |   |   |   |   school_accreditation0 <= 0
|   |   |   |   |   |   |   parent_was_in_college0 <= 0
|   |   |   |   |   |   |   |   parent_age <= 51: 0 (3.0)
|   |   |   |   |   |   |   |   parent_age > 51: 1 (1.0)
|   |   |   |   |   |   |   parent_was_in_college0 > 0
|   |   |   |   |   |   |   |   parent_age <= 53: 1 (9.0)
|   |   |   |   |   |   |   |   parent_age > 53: 0 (2.0)
|   |   |   |   |   |   school_accreditation0 > 0: 1 (3.0)
|   |   |   interest3 > 0: 1 (28.0)
|   |   house_area > 0.413
|   |   |   average_gradesB <= 0
|   |   |   |   type_school0 <= 0: 0 (172.0/14.0)
|   |   |   |   type_school0 > 0
|   |   |   |   |   interest2 <= 0
|   |   |   |   |   |   residence0 <= 0: 0 (26.0/3.0)
|   |   |   |   |   |   residence0 > 0
|   |   |   |   |   |   |   school_accreditation0 <= 0
|   |   |   |   |   |   |   |   parent_salary <= 0.495556: 1 (7.0)
|   |   |   |   |   |   |   |   parent_salary > 0.495556
|   |   |   |   |   |   |   |   |   parent_age <= 48: 1 (4.0/1.0)
|   |   |   |   |   |   |   |   |   parent_age > 48
|   |   |   |   |   |   |   |   |   |   gender0 <= 0: 0 (16.0)
|   |   |   |   |   |   |   |   |   |   gender0 > 0
|   |   |   |   |   |   |   |   |   |   |   house_area <= 0.618
|   |   |   |   |   |   |   |   |   |   |   |   parent_was_in_college0 <= 0
|   |   |   |   |   |   |   |   |   |   |   |   |   interest1 <= 0: 0 (4.0/1.0)
|   |   |   |   |   |   |   |   |   |   |   |   |   interest1 > 0: 1 (1.0)
|   |   |   |   |   |   |   |   |   |   |   |   parent_was_in_college0 > 0
|   |   |   |   |   |   |   |   |   |   |   |   |   interest1 <= 0: 1 (4.0)
|   |   |   |   |   |   |   |   |   |   |   |   |   interest1 > 0: 0 (1.0)
|   |   |   |   |   |   |   |   |   |   |   house_area > 0.618: 0 (9.0)
|   |   |   |   |   |   |   school_accreditation0 > 0
|   |   |   |   |   |   |   |   interest0 <= 0
|   |   |   |   |   |   |   |   |   parent_salary <= 0.587778
|   |   |   |   |   |   |   |   |   |   average_grades+B <= 0: 0 (1.0)
|   |   |   |   |   |   |   |   |   |   average_grades+B > 0
|   |   |   |   |   |   |   |   |   |   |   house_area <= 0.47: 0 (1.0)
|   |   |   |   |   |   |   |   |   |   |   house_area > 0.47: 1 (11.0)
|   |   |   |   |   |   |   |   |   parent_salary > 0.587778: 0 (10.0/2.0)
|   |   |   |   |   |   |   |   interest0 > 0
|   |   |   |   |   |   |   |   |   parent_salary <= 0.676667: 1 (24.0/2.0)
|   |   |   |   |   |   |   |   |   parent_salary > 0.676667
|   |   |   |   |   |   |   |   |   |   house_area <= 0.5: 1 (1.0)
|   |   |   |   |   |   |   |   |   |   house_area > 0.5: 0 (3.0)
|   |   |   |   |   interest2 > 0: 0 (24.0)
|   |   |   average_gradesB > 0
|   |   |   |   residence0 <= 0
|   |   |   |   |   parent_was_in_college0 <= 0: 0 (9.0)
|   |   |   |   |   parent_was_in_college0 > 0
|   |   |   |   |   |   school_accreditation0 <= 0
|   |   |   |   |   |   |   interest3 <= 0: 0 (45.0/10.0)
|   |   |   |   |   |   |   interest3 > 0
|   |   |   |   |   |   |   |   house_area <= 0.675: 1 (5.0)
|   |   |   |   |   |   |   |   house_area > 0.675: 0 (2.0)
|   |   |   |   |   |   school_accreditation0 > 0
|   |   |   |   |   |   |   parent_salary <= 0.451111: 0 (4.0)
|   |   |   |   |   |   |   parent_salary > 0.451111
|   |   |   |   |   |   |   |   house_area <= 0.503: 0 (2.0)
|   |   |   |   |   |   |   |   house_area > 0.503: 1 (9.0)
|   |   |   |   residence0 > 0
|   |   |   |   |   house_area <= 0.635: 1 (59.0/6.0)
|   |   |   |   |   house_area > 0.635
|   |   |   |   |   |   parent_age <= 46: 1 (4.0)
|   |   |   |   |   |   parent_age > 46
|   |   |   |   |   |   |   interest2 <= 0: 0 (17.0/2.0)
|   |   |   |   |   |   |   interest2 > 0
|   |   |   |   |   |   |   |   type_school0 <= 0: 1 (2.0)
|   |   |   |   |   |   |   |   type_school0 > 0: 0 (1.0)
average_gradesA > 0: 0 (123.0/2.0)

Number of Leaves  :     53

Size of the tree :  105

Confusion Matrix and Statistics

          Reference
Prediction   1   0
         1 465  17
         0  35 483
                                          
               Accuracy : 0.948           
                 95% CI : (0.9324, 0.9609)
    No Information Rate : 0.5             
    P-Value [Acc > NIR] : <2e-16          
                                          
                  Kappa : 0.896           
                                          
 Mcnemar's Test P-Value : 0.0184          
                                          
            Sensitivity : 0.9300          
            Specificity : 0.9660          
         Pos Pred Value : 0.9647          
         Neg Pred Value : 0.9324          
             Prevalence : 0.5000          
         Detection Rate : 0.4650          
   Detection Prevalence : 0.4820          
      Balanced Accuracy : 0.9480          
                                          
       'Positive' Class : 1               
                                          

The gain ratio consistently favors unbalanced splits, as demonstrated by its selection of “Parent salary” as the root for all three trees even though it’s shown in the tree “average grades” as the root but the split point’s that all the value in one diraction . In this configuration, one partition is notably smaller than the others, and the feature exhibits a higher number of distinct values. Despite the fact that the node corresponding to “Parent age”, “parent was in collage” is not pure, the resulting trees exhibit impressive accuracy levels, all surpassing 94%

cart

library(caret)
library(rpart)
library(rpart.plot)



set.seed(1234) # Random seed
fold <- c(2, 3, 4)  # Values of xval (number of folds) to try

for (fold in fold) {
cat("fold"=fold)
trctrl <- trainControl(method = "cv", number = fold, savePredictions = TRUE)
dt_fit <- train(factor(will_go_to_college) ~ ., data = data, method = "rpart1SE", trControl = trctrl)

 # Plot decision tree
  rpart.plot(dt_fit$finalModel)
  
# Get predicted values
predictions <- predict(dt_fit, newdata = data)

# Create confusion matrix
confusion_matrix <- confusionMatrix(predictions, data$will_go_to_college)

# Print the confusion matrix
print(confusion_matrix)

#print each flod
pred <- dt_fit$pred
pred$equal <- ifelse(pred$pred == pred$obs, 1,0)
eachfold <- pred %>%                                        
  group_by(Resample) %>%                         
  summarise_at(vars(equal),                     
               list(Accuracy = mean))              
print(eachfold  )

  }
2
Warning: Cannot retrieve the data used to build the model (so cannot determine roundint and is.binary for the variables).
To silence this warning:
    Call rpart.plot with roundint=FALSE,
    or rebuild the rpart model with model=TRUE.
Confusion Matrix and Statistics

          Reference
Prediction   1   0
         1 430  67
         0  70 433
                                          
               Accuracy : 0.863           
                 95% CI : (0.8401, 0.8837)
    No Information Rate : 0.5             
    P-Value [Acc > NIR] : <2e-16          
                                          
                  Kappa : 0.726           
                                          
 Mcnemar's Test P-Value : 0.8643          
                                          
            Sensitivity : 0.8600          
            Specificity : 0.8660          
         Pos Pred Value : 0.8652          
         Neg Pred Value : 0.8608          
             Prevalence : 0.5000          
         Detection Rate : 0.4300          
   Detection Prevalence : 0.4970          
      Balanced Accuracy : 0.8630          
                                          
       'Positive' Class : 1               
                                          
3

Confusion Matrix and Statistics

          Reference
Prediction   1   0
         1 430  67
         0  70 433
                                          
               Accuracy : 0.863           
                 95% CI : (0.8401, 0.8837)
    No Information Rate : 0.5             
    P-Value [Acc > NIR] : <2e-16          
                                          
                  Kappa : 0.726           
                                          
 Mcnemar's Test P-Value : 0.8643          
                                          
            Sensitivity : 0.8600          
            Specificity : 0.8660          
         Pos Pred Value : 0.8652          
         Neg Pred Value : 0.8608          
             Prevalence : 0.5000          
         Detection Rate : 0.4300          
   Detection Prevalence : 0.4970          
      Balanced Accuracy : 0.8630          
                                          
       'Positive' Class : 1               
                                          
4

Confusion Matrix and Statistics

          Reference
Prediction   1   0
         1 430  67
         0  70 433
                                          
               Accuracy : 0.863           
                 95% CI : (0.8401, 0.8837)
    No Information Rate : 0.5             
    P-Value [Acc > NIR] : <2e-16          
                                          
                  Kappa : 0.726           
                                          
 Mcnemar's Test P-Value : 0.8643          
                                          
            Sensitivity : 0.8600          
            Specificity : 0.8660          
         Pos Pred Value : 0.8652          
         Neg Pred Value : 0.8608          
             Prevalence : 0.5000          
         Detection Rate : 0.4300          
   Detection Prevalence : 0.4970          
      Balanced Accuracy : 0.8630          
                                          
       'Positive' Class : 1               
                                          

averages grade exhibits the smallest Gini index binary split, signifying a substantial reduction in impurity. Hence, it is chosen as the splitting attribute. Conversely, attributes such as ‘type_school,’ ‘school_accreditation,’ ‘gender,’ ‘parent_age,’ and ‘parent_was_in_college’ yield minimal impurity reduction, leading to their exclusion from the tree. The dataset’s balanced class labels and marginal differences in accuracy across folds result in consistent tree structures, as evidenced by the identical trees in all folds. For further details, refer to the index. Overall, the model attains an 86% accuracy, emphasizing its effectiveness.

ID3

library(caret)
library(partykit)
Loading required package: grid
Loading required package: libcoin
Loading required package: mvtnorm
library(dplyr)

fold <- c(4, 3, 2)  # Values of xval (number of folds) to try

for (fold in fold) {
  cat("fold =", fold, "\n")
  trctrl <- trainControl(method = "cv", number = fold, savePredictions = TRUE)
  
  id_fit <- train(factor(will_go_to_college) ~ ., data = data, method = "ctree", trControl = trctrl)
   print(id_fit$finalModel)
  # Get predicted values
  predictions <- predict(id_fit, newdata = data)
  
  # Create confusion matrix
  confusion_matrix <- confusionMatrix(predictions, data$will_go_to_college)
  
  # Print the confusion matrix
  print(confusion_matrix)
  
  # Print accuracy for each fold
  pred <- id_fit$pred
  pred$equal <- ifelse(pred$pred == pred$obs, 1, 0)
  eachfold <- pred %>%
    group_by(Resample) %>%
    summarise_at(vars(equal), list(Accuracy = mean))
  print(eachfold)

  # Plot decision tree
  plot(id_fit$finalModel, main = paste("Decision Tree (Fold", fold, ")"))
  
}
fold = 4 

     Conditional inference tree with 53 terminal nodes

Response:  .outcome 
Inputs:  type_school0, school_accreditation0, gender0, interest3, interest2, interest1, interest0, residence0, parent_age, parent_salary, house_area, average_gradesA, average_grades+B, average_gradesB, average_grades+C, average_gradesC, average_grades+D, average_gradesD, average_gradesF, parent_was_in_college0 
Number of observations:  1000 

1) parent_salary <= 0.4166667; criterion = 1, statistic = 226.057
  2) average_gradesA <= 0; criterion = 1, statistic = 104.378
    3) house_area <= 0.559; criterion = 1, statistic = 43.299
      4) interest0 <= 0; criterion = 0.986, statistic = 11.464
        5) interest1 <= 0; criterion = 0.54, statistic = 4.692
          6) type_school0 <= 0; criterion = 0.024, statistic = 1.878
            7) average_grades+B <= 0; criterion = 0.058, statistic = 2.261
              8)*  weights = 104 
            7) average_grades+B > 0
              9) school_accreditation0 <= 0; criterion = 0.024, statistic = 1.875
                10)*  weights = 16 
              9) school_accreditation0 > 0
                11)*  weights = 30 
          6) type_school0 > 0
            12) house_area <= 0.298; criterion = 0.062, statistic = 2.299
              13)*  weights = 7 
            12) house_area > 0.298
              14)*  weights = 20 
        5) interest1 > 0
          15)*  weights = 10 
      4) interest0 > 0
        16) house_area <= 0.42; criterion = 0.808, statistic = 6.529
          17)*  weights = 7 
        16) house_area > 0.42
          18)*  weights = 13 
    3) house_area > 0.559
      19) parent_was_in_college0 <= 0; criterion = 1, statistic = 30.227
        20) type_school0 <= 0; criterion = 0.696, statistic = 5.6
          21) house_area <= 0.688; criterion = 0.864, statistic = 7.199
            22) interest0 <= 0; criterion = 0.022, statistic = 1.857
              23)*  weights = 13 
            22) interest0 > 0
              24)*  weights = 7 
          21) house_area > 0.688
            25)*  weights = 7 
        20) type_school0 > 0
          26)*  weights = 28 
      19) parent_was_in_college0 > 0
        27) parent_salary <= 0.2966667; criterion = 1, statistic = 17.824
          28)*  weights = 18 
        27) parent_salary > 0.2966667
          29) interest2 <= 0; criterion = 0.044, statistic = 12.462
            30)*  weights = 19 
          29) interest2 > 0
            31)*  weights = 9 
  2) average_gradesA > 0
    32) parent_age <= 54; criterion = 0.358, statistic = 3.84
      33)*  weights = 18 
    32) parent_age > 54
      34)*  weights = 7 
1) parent_salary > 0.4166667
  35) house_area <= 0.413; criterion = 1, statistic = 129.968
    36) type_school0 <= 0; criterion = 0.998, statistic = 19.583
      37) school_accreditation0 <= 0; criterion = 0.745, statistic = 6.509
        38)*  weights = 35 
      37) school_accreditation0 > 0
        39)*  weights = 12 
    36) type_school0 > 0
      40) interest3 <= 0; criterion = 0.993, statistic = 12.818
        41) gender0 <= 0; criterion = 0.889, statistic = 7.586
          42)*  weights = 13 
        41) gender0 > 0
          43)*  weights = 20 
      40) interest3 > 0
        44)*  weights = 15 
  35) house_area > 0.413
    45) average_gradesB <= 0; criterion = 1, statistic = 77.328
      46) type_school0 <= 0; criterion = 1, statistic = 52.367
        47) school_accreditation0 <= 0; criterion = 0.997, statistic = 14.096
          48) gender0 <= 0; criterion = 0.364, statistic = 3.865
            49) parent_age <= 54; criterion = 0.231, statistic = 3.27
              50) residence0 <= 0; criterion = 0.487, statistic = 4.429
                51)*  weights = 31 
              50) residence0 > 0
                52)*  weights = 7 
            49) parent_age > 54
              53)*  weights = 10 
          48) gender0 > 0
            54) parent_salary <= 0.5444444; criterion = 0.489, statistic = 4.44
              55)*  weights = 18 
            54) parent_salary > 0.5444444
              56) house_area <= 0.516; criterion = 0.058, statistic = 16.64
                57)*  weights = 7 
              56) house_area > 0.516
                58)*  weights = 20 
        47) school_accreditation0 > 0
          59) gender0 <= 0; criterion = 0.038, statistic = 2.068
            60)*  weights = 76 
          59) gender0 > 0
            61) interest0 <= 0; criterion = 0.011, statistic = 1.632
              62) residence0 <= 0; criterion = 0.029, statistic = 1.954
                63)*  weights = 20 
              62) residence0 > 0
                64) parent_salary <= 0.5577778; criterion = 0.015, statistic = 2.193
                  65)*  weights = 7 
                64) parent_salary > 0.5577778
                  66)*  weights = 14 
            61) interest0 > 0
              67)*  weights = 33 
      46) type_school0 > 0
        68) interest0 <= 0; criterion = 1, statistic = 25.096
          69) interest1 <= 0; criterion = 1, statistic = 25.014
            70) residence0 <= 0; criterion = 0.569, statistic = 4.838
              71)*  weights = 32 
            70) residence0 > 0
              72) parent_salary <= 0.5544444; criterion = 0.663, statistic = 5.383
                73)*  weights = 7 
              72) parent_salary > 0.5544444
                74) house_area <= 0.572; criterion = 0.011, statistic = 1.616
                  75)*  weights = 17 
                74) house_area > 0.572
                  76)*  weights = 26 
          69) interest1 > 0
            77) average_grades+B <= 0; criterion = 0.912, statistic = 8.042
              78)*  weights = 8 
            77) average_grades+B > 0
              79)*  weights = 16 
        68) interest0 > 0
          80) average_gradesA <= 0; criterion = 0.987, statistic = 11.571
            81) residence0 <= 0; criterion = 0.48, statistic = 4.395
              82)*  weights = 9 
            81) residence0 > 0
              83) house_area <= 0.695; criterion = 0.647, statistic = 5.281
                84) parent_salary <= 0.5344444; criterion = 0.4, statistic = 4.026
                  85)*  weights = 13 
                84) parent_salary > 0.5344444
                  86)*  weights = 24 
              83) house_area > 0.695
                87)*  weights = 10 
          80) average_gradesA > 0
            88)*  weights = 8 
    45) average_gradesB > 0
      89) residence0 <= 0; criterion = 1, statistic = 27.838
        90) parent_age <= 54; criterion = 0.923, statistic = 8.29
          91) house_area <= 0.562; criterion = 0.558, statistic = 4.783
            92) interest2 <= 0; criterion = 0.161, statistic = 4.312
              93) gender0 <= 0; criterion = 0.347, statistic = 6.609
                94)*  weights = 13 
              93) gender0 > 0
                95)*  weights = 12 
            92) interest2 > 0
              96)*  weights = 9 
          91) house_area > 0.562
            97)*  weights = 18 
        90) parent_age > 54
          98)*  weights = 24 
      89) residence0 > 0
        99) house_area <= 0.633; criterion = 1, statistic = 27.626
          100) parent_was_in_college0 <= 0; criterion = 0.437, statistic = 4.195
            101)*  weights = 36 
          100) parent_was_in_college0 > 0
            102)*  weights = 23 
        99) house_area > 0.633
          103) parent_salary <= 0.5466667; criterion = 0.516, statistic = 4.568
            104)*  weights = 16 
          103) parent_salary > 0.5466667
            105)*  weights = 8 
Confusion Matrix and Statistics

          Reference
Prediction   1   0
         1 456  60
         0  44 440
                                          
               Accuracy : 0.896           
                 95% CI : (0.8754, 0.9142)
    No Information Rate : 0.5             
    P-Value [Acc > NIR] : <2e-16          
                                          
                  Kappa : 0.792           
                                          
 Mcnemar's Test P-Value : 0.1413          
                                          
            Sensitivity : 0.9120          
            Specificity : 0.8800          
         Pos Pred Value : 0.8837          
         Neg Pred Value : 0.9091          
             Prevalence : 0.5000          
         Detection Rate : 0.4560          
   Detection Prevalence : 0.5160          
      Balanced Accuracy : 0.8960          
                                          
       'Positive' Class : 1               
                                          
fold = 3 

     Conditional inference tree with 53 terminal nodes

Response:  .outcome 
Inputs:  type_school0, school_accreditation0, gender0, interest3, interest2, interest1, interest0, residence0, parent_age, parent_salary, house_area, average_gradesA, average_grades+B, average_gradesB, average_grades+C, average_gradesC, average_grades+D, average_gradesD, average_gradesF, parent_was_in_college0 
Number of observations:  1000 

1) parent_salary <= 0.4166667; criterion = 1, statistic = 226.057
  2) average_gradesA <= 0; criterion = 1, statistic = 104.378
    3) house_area <= 0.559; criterion = 1, statistic = 43.299
      4) interest0 <= 0; criterion = 0.986, statistic = 11.464
        5) interest1 <= 0; criterion = 0.54, statistic = 4.692
          6) type_school0 <= 0; criterion = 0.024, statistic = 1.878
            7) average_grades+B <= 0; criterion = 0.058, statistic = 2.261
              8)*  weights = 104 
            7) average_grades+B > 0
              9) school_accreditation0 <= 0; criterion = 0.024, statistic = 1.875
                10)*  weights = 16 
              9) school_accreditation0 > 0
                11)*  weights = 30 
          6) type_school0 > 0
            12) house_area <= 0.298; criterion = 0.062, statistic = 2.299
              13)*  weights = 7 
            12) house_area > 0.298
              14)*  weights = 20 
        5) interest1 > 0
          15)*  weights = 10 
      4) interest0 > 0
        16) house_area <= 0.42; criterion = 0.808, statistic = 6.529
          17)*  weights = 7 
        16) house_area > 0.42
          18)*  weights = 13 
    3) house_area > 0.559
      19) parent_was_in_college0 <= 0; criterion = 1, statistic = 30.227
        20) type_school0 <= 0; criterion = 0.696, statistic = 5.6
          21) house_area <= 0.688; criterion = 0.864, statistic = 7.199
            22) interest0 <= 0; criterion = 0.022, statistic = 1.857
              23)*  weights = 13 
            22) interest0 > 0
              24)*  weights = 7 
          21) house_area > 0.688
            25)*  weights = 7 
        20) type_school0 > 0
          26)*  weights = 28 
      19) parent_was_in_college0 > 0
        27) parent_salary <= 0.2966667; criterion = 1, statistic = 17.824
          28)*  weights = 18 
        27) parent_salary > 0.2966667
          29) interest2 <= 0; criterion = 0.044, statistic = 12.462
            30)*  weights = 19 
          29) interest2 > 0
            31)*  weights = 9 
  2) average_gradesA > 0
    32) parent_age <= 54; criterion = 0.358, statistic = 3.84
      33)*  weights = 18 
    32) parent_age > 54
      34)*  weights = 7 
1) parent_salary > 0.4166667
  35) house_area <= 0.413; criterion = 1, statistic = 129.968
    36) type_school0 <= 0; criterion = 0.998, statistic = 19.583
      37) school_accreditation0 <= 0; criterion = 0.745, statistic = 6.509
        38)*  weights = 35 
      37) school_accreditation0 > 0
        39)*  weights = 12 
    36) type_school0 > 0
      40) interest3 <= 0; criterion = 0.993, statistic = 12.818
        41) gender0 <= 0; criterion = 0.889, statistic = 7.586
          42)*  weights = 13 
        41) gender0 > 0
          43)*  weights = 20 
      40) interest3 > 0
        44)*  weights = 15 
  35) house_area > 0.413
    45) average_gradesB <= 0; criterion = 1, statistic = 77.328
      46) type_school0 <= 0; criterion = 1, statistic = 52.367
        47) school_accreditation0 <= 0; criterion = 0.997, statistic = 14.096
          48) gender0 <= 0; criterion = 0.364, statistic = 3.865
            49) parent_age <= 54; criterion = 0.231, statistic = 3.27
              50) residence0 <= 0; criterion = 0.487, statistic = 4.429
                51)*  weights = 31 
              50) residence0 > 0
                52)*  weights = 7 
            49) parent_age > 54
              53)*  weights = 10 
          48) gender0 > 0
            54) parent_salary <= 0.5444444; criterion = 0.489, statistic = 4.44
              55)*  weights = 18 
            54) parent_salary > 0.5444444
              56) house_area <= 0.516; criterion = 0.058, statistic = 16.64
                57)*  weights = 7 
              56) house_area > 0.516
                58)*  weights = 20 
        47) school_accreditation0 > 0
          59) gender0 <= 0; criterion = 0.038, statistic = 2.068
            60)*  weights = 76 
          59) gender0 > 0
            61) interest0 <= 0; criterion = 0.011, statistic = 1.632
              62) residence0 <= 0; criterion = 0.029, statistic = 1.954
                63)*  weights = 20 
              62) residence0 > 0
                64) parent_salary <= 0.5577778; criterion = 0.015, statistic = 2.193
                  65)*  weights = 7 
                64) parent_salary > 0.5577778
                  66)*  weights = 14 
            61) interest0 > 0
              67)*  weights = 33 
      46) type_school0 > 0
        68) interest0 <= 0; criterion = 1, statistic = 25.096
          69) interest1 <= 0; criterion = 1, statistic = 25.014
            70) residence0 <= 0; criterion = 0.569, statistic = 4.838
              71)*  weights = 32 
            70) residence0 > 0
              72) parent_salary <= 0.5544444; criterion = 0.663, statistic = 5.383
                73)*  weights = 7 
              72) parent_salary > 0.5544444
                74) house_area <= 0.572; criterion = 0.011, statistic = 1.616
                  75)*  weights = 17 
                74) house_area > 0.572
                  76)*  weights = 26 
          69) interest1 > 0
            77) average_grades+B <= 0; criterion = 0.912, statistic = 8.042
              78)*  weights = 8 
            77) average_grades+B > 0
              79)*  weights = 16 
        68) interest0 > 0
          80) average_gradesA <= 0; criterion = 0.987, statistic = 11.571
            81) residence0 <= 0; criterion = 0.48, statistic = 4.395
              82)*  weights = 9 
            81) residence0 > 0
              83) house_area <= 0.695; criterion = 0.647, statistic = 5.281
                84) parent_salary <= 0.5344444; criterion = 0.4, statistic = 4.026
                  85)*  weights = 13 
                84) parent_salary > 0.5344444
                  86)*  weights = 24 
              83) house_area > 0.695
                87)*  weights = 10 
          80) average_gradesA > 0
            88)*  weights = 8 
    45) average_gradesB > 0
      89) residence0 <= 0; criterion = 1, statistic = 27.838
        90) parent_age <= 54; criterion = 0.923, statistic = 8.29
          91) house_area <= 0.562; criterion = 0.558, statistic = 4.783
            92) interest2 <= 0; criterion = 0.161, statistic = 4.312
              93) gender0 <= 0; criterion = 0.347, statistic = 6.609
                94)*  weights = 13 
              93) gender0 > 0
                95)*  weights = 12 
            92) interest2 > 0
              96)*  weights = 9 
          91) house_area > 0.562
            97)*  weights = 18 
        90) parent_age > 54
          98)*  weights = 24 
      89) residence0 > 0
        99) house_area <= 0.633; criterion = 1, statistic = 27.626
          100) parent_was_in_college0 <= 0; criterion = 0.437, statistic = 4.195
            101)*  weights = 36 
          100) parent_was_in_college0 > 0
            102)*  weights = 23 
        99) house_area > 0.633
          103) parent_salary <= 0.5466667; criterion = 0.516, statistic = 4.568
            104)*  weights = 16 
          103) parent_salary > 0.5466667
            105)*  weights = 8 
Confusion Matrix and Statistics

          Reference
Prediction   1   0
         1 456  60
         0  44 440
                                          
               Accuracy : 0.896           
                 95% CI : (0.8754, 0.9142)
    No Information Rate : 0.5             
    P-Value [Acc > NIR] : <2e-16          
                                          
                  Kappa : 0.792           
                                          
 Mcnemar's Test P-Value : 0.1413          
                                          
            Sensitivity : 0.9120          
            Specificity : 0.8800          
         Pos Pred Value : 0.8837          
         Neg Pred Value : 0.9091          
             Prevalence : 0.5000          
         Detection Rate : 0.4560          
   Detection Prevalence : 0.5160          
      Balanced Accuracy : 0.8960          
                                          
       'Positive' Class : 1               
                                          
fold = 2 

     Conditional inference tree with 53 terminal nodes

Response:  .outcome 
Inputs:  type_school0, school_accreditation0, gender0, interest3, interest2, interest1, interest0, residence0, parent_age, parent_salary, house_area, average_gradesA, average_grades+B, average_gradesB, average_grades+C, average_gradesC, average_grades+D, average_gradesD, average_gradesF, parent_was_in_college0 
Number of observations:  1000 

1) parent_salary <= 0.4166667; criterion = 1, statistic = 226.057
  2) average_gradesA <= 0; criterion = 1, statistic = 104.378
    3) house_area <= 0.559; criterion = 1, statistic = 43.299
      4) interest0 <= 0; criterion = 0.986, statistic = 11.464
        5) interest1 <= 0; criterion = 0.54, statistic = 4.692
          6) type_school0 <= 0; criterion = 0.024, statistic = 1.878
            7) average_grades+B <= 0; criterion = 0.058, statistic = 2.261
              8)*  weights = 104 
            7) average_grades+B > 0
              9) school_accreditation0 <= 0; criterion = 0.024, statistic = 1.875
                10)*  weights = 16 
              9) school_accreditation0 > 0
                11)*  weights = 30 
          6) type_school0 > 0
            12) house_area <= 0.298; criterion = 0.062, statistic = 2.299
              13)*  weights = 7 
            12) house_area > 0.298
              14)*  weights = 20 
        5) interest1 > 0
          15)*  weights = 10 
      4) interest0 > 0
        16) house_area <= 0.42; criterion = 0.808, statistic = 6.529
          17)*  weights = 7 
        16) house_area > 0.42
          18)*  weights = 13 
    3) house_area > 0.559
      19) parent_was_in_college0 <= 0; criterion = 1, statistic = 30.227
        20) type_school0 <= 0; criterion = 0.696, statistic = 5.6
          21) house_area <= 0.688; criterion = 0.864, statistic = 7.199
            22) interest0 <= 0; criterion = 0.022, statistic = 1.857
              23)*  weights = 13 
            22) interest0 > 0
              24)*  weights = 7 
          21) house_area > 0.688
            25)*  weights = 7 
        20) type_school0 > 0
          26)*  weights = 28 
      19) parent_was_in_college0 > 0
        27) parent_salary <= 0.2966667; criterion = 1, statistic = 17.824
          28)*  weights = 18 
        27) parent_salary > 0.2966667
          29) interest2 <= 0; criterion = 0.044, statistic = 12.462
            30)*  weights = 19 
          29) interest2 > 0
            31)*  weights = 9 
  2) average_gradesA > 0
    32) parent_age <= 54; criterion = 0.358, statistic = 3.84
      33)*  weights = 18 
    32) parent_age > 54
      34)*  weights = 7 
1) parent_salary > 0.4166667
  35) house_area <= 0.413; criterion = 1, statistic = 129.968
    36) type_school0 <= 0; criterion = 0.998, statistic = 19.583
      37) school_accreditation0 <= 0; criterion = 0.745, statistic = 6.509
        38)*  weights = 35 
      37) school_accreditation0 > 0
        39)*  weights = 12 
    36) type_school0 > 0
      40) interest3 <= 0; criterion = 0.993, statistic = 12.818
        41) gender0 <= 0; criterion = 0.889, statistic = 7.586
          42)*  weights = 13 
        41) gender0 > 0
          43)*  weights = 20 
      40) interest3 > 0
        44)*  weights = 15 
  35) house_area > 0.413
    45) average_gradesB <= 0; criterion = 1, statistic = 77.328
      46) type_school0 <= 0; criterion = 1, statistic = 52.367
        47) school_accreditation0 <= 0; criterion = 0.997, statistic = 14.096
          48) gender0 <= 0; criterion = 0.364, statistic = 3.865
            49) parent_age <= 54; criterion = 0.231, statistic = 3.27
              50) residence0 <= 0; criterion = 0.487, statistic = 4.429
                51)*  weights = 31 
              50) residence0 > 0
                52)*  weights = 7 
            49) parent_age > 54
              53)*  weights = 10 
          48) gender0 > 0
            54) parent_salary <= 0.5444444; criterion = 0.489, statistic = 4.44
              55)*  weights = 18 
            54) parent_salary > 0.5444444
              56) house_area <= 0.516; criterion = 0.058, statistic = 16.64
                57)*  weights = 7 
              56) house_area > 0.516
                58)*  weights = 20 
        47) school_accreditation0 > 0
          59) gender0 <= 0; criterion = 0.038, statistic = 2.068
            60)*  weights = 76 
          59) gender0 > 0
            61) interest0 <= 0; criterion = 0.011, statistic = 1.632
              62) residence0 <= 0; criterion = 0.029, statistic = 1.954
                63)*  weights = 20 
              62) residence0 > 0
                64) parent_salary <= 0.5577778; criterion = 0.015, statistic = 2.193
                  65)*  weights = 7 
                64) parent_salary > 0.5577778
                  66)*  weights = 14 
            61) interest0 > 0
              67)*  weights = 33 
      46) type_school0 > 0
        68) interest0 <= 0; criterion = 1, statistic = 25.096
          69) interest1 <= 0; criterion = 1, statistic = 25.014
            70) residence0 <= 0; criterion = 0.569, statistic = 4.838
              71)*  weights = 32 
            70) residence0 > 0
              72) parent_salary <= 0.5544444; criterion = 0.663, statistic = 5.383
                73)*  weights = 7 
              72) parent_salary > 0.5544444
                74) house_area <= 0.572; criterion = 0.011, statistic = 1.616
                  75)*  weights = 17 
                74) house_area > 0.572
                  76)*  weights = 26 
          69) interest1 > 0
            77) average_grades+B <= 0; criterion = 0.912, statistic = 8.042
              78)*  weights = 8 
            77) average_grades+B > 0
              79)*  weights = 16 
        68) interest0 > 0
          80) average_gradesA <= 0; criterion = 0.987, statistic = 11.571
            81) residence0 <= 0; criterion = 0.48, statistic = 4.395
              82)*  weights = 9 
            81) residence0 > 0
              83) house_area <= 0.695; criterion = 0.647, statistic = 5.281
                84) parent_salary <= 0.5344444; criterion = 0.4, statistic = 4.026
                  85)*  weights = 13 
                84) parent_salary > 0.5344444
                  86)*  weights = 24 
              83) house_area > 0.695
                87)*  weights = 10 
          80) average_gradesA > 0
            88)*  weights = 8 
    45) average_gradesB > 0
      89) residence0 <= 0; criterion = 1, statistic = 27.838
        90) parent_age <= 54; criterion = 0.923, statistic = 8.29
          91) house_area <= 0.562; criterion = 0.558, statistic = 4.783
            92) interest2 <= 0; criterion = 0.161, statistic = 4.312
              93) gender0 <= 0; criterion = 0.347, statistic = 6.609
                94)*  weights = 13 
              93) gender0 > 0
                95)*  weights = 12 
            92) interest2 > 0
              96)*  weights = 9 
          91) house_area > 0.562
            97)*  weights = 18 
        90) parent_age > 54
          98)*  weights = 24 
      89) residence0 > 0
        99) house_area <= 0.633; criterion = 1, statistic = 27.626
          100) parent_was_in_college0 <= 0; criterion = 0.437, statistic = 4.195
            101)*  weights = 36 
          100) parent_was_in_college0 > 0
            102)*  weights = 23 
        99) house_area > 0.633
          103) parent_salary <= 0.5466667; criterion = 0.516, statistic = 4.568
            104)*  weights = 16 
          103) parent_salary > 0.5466667
            105)*  weights = 8 
Confusion Matrix and Statistics

          Reference
Prediction   1   0
         1 456  60
         0  44 440
                                          
               Accuracy : 0.896           
                 95% CI : (0.8754, 0.9142)
    No Information Rate : 0.5             
    P-Value [Acc > NIR] : <2e-16          
                                          
                  Kappa : 0.792           
                                          
 Mcnemar's Test P-Value : 0.1413          
                                          
            Sensitivity : 0.9120          
            Specificity : 0.8800          
         Pos Pred Value : 0.8837          
         Neg Pred Value : 0.9091          
             Prevalence : 0.5000          
         Detection Rate : 0.4560          
   Detection Prevalence : 0.5160          
      Balanced Accuracy : 0.8960          
                                          
       'Positive' Class : 1               
                                          

Additional insights reveal that attributes such as school accreditation, parent was in collage contribute to high impurity. In contrast, Parent salary is chosen as the root due to its high purity. Given the balanced class labels in our dataset and minimal variations in accuracy across folds, the result yields consistent tree structures, with only two distinct trees observed for all folds. For further details, please refer to the index. The overall accuracy consistently surpasses 86%, affirming the model’s efficacy

final analysis

The C4.5 model emerged as the top-performing evaluation model, achieving an impressive accuracy rate of 94% to 97%. It was followed by the ID3 model, which demonstrated slightly lower accuracy ranging from 86% to 89%. Lastly, the cart model exhibited an accuracy rate of 86%.

the C4.5 gave better result than ID3 and Cart because they both are biased to multivalued where C4.5 normalized parent salary and house area which are multivalue attributes

C4.5 and ID3 models, the parent’s salary served as the root feature, indicating that the financial circumstances of the student are a crucial factor for contemporary universities.

index

all trees of C4.5

library(caret)
library(rpart)
library(dplyr)
library(rpart.plot)

folds <- c(2, 3, 4)  # Values of xval (number of folds) to try

for(folds in folds){
for (fold_val in 1:folds) {
  cat("fold =", fold_val, "\n")
  
  trctrl <- trainControl(method = "cv", number = fold_val, savePredictions = TRUE)
  c45_fit <- train(factor(will_go_to_college) ~ ., data = data, method = "J48", trControl = trctrl)
  
   plot(c45_fit$finalModel, main = paste("Decision Tree - Fold", fold_val))
}
  # Get predicted values
  predictions <- predict(c45_fit, newdata = data)
  
  # Create confusion matrix
  confusion_matrix <- confusionMatrix(predictions, data$will_go_to_college)
  
  # Print the confusion matrix
  print(confusion_matrix)
  
  # Print accuracy for each fold
  pred <- c45_fit$pred
  pred$equal <- ifelse(pred$pred == pred$obs, 1, 0)
  eachfold <- pred %>%
    group_by(Resample) %>%
    summarise_at(vars(equal),
                 list(Accuracy = mean))
  print(eachfold)
  
  # Plot decision tree for each fold
 

}
fold = 1 
fold = 2 
Confusion Matrix and Statistics

          Reference
Prediction   1   0
         1 486  10
         0  14 490
                                          
               Accuracy : 0.976           
                 95% CI : (0.9645, 0.9846)
    No Information Rate : 0.5             
    P-Value [Acc > NIR] : <2e-16          
                                          
                  Kappa : 0.952           
                                          
 Mcnemar's Test P-Value : 0.5403          
                                          
            Sensitivity : 0.9720          
            Specificity : 0.9800          
         Pos Pred Value : 0.9798          
         Neg Pred Value : 0.9722          
             Prevalence : 0.5000          
         Detection Rate : 0.4860          
   Detection Prevalence : 0.4960          
      Balanced Accuracy : 0.9760          
                                          
       'Positive' Class : 1               
                                          
fold = 1 
fold = 2 
fold = 3 
Confusion Matrix and Statistics

          Reference
Prediction   1   0
         1 465  17
         0  35 483
                                          
               Accuracy : 0.948           
                 95% CI : (0.9324, 0.9609)
    No Information Rate : 0.5             
    P-Value [Acc > NIR] : <2e-16          
                                          
                  Kappa : 0.896           
                                          
 Mcnemar's Test P-Value : 0.0184          
                                          
            Sensitivity : 0.9300          
            Specificity : 0.9660          
         Pos Pred Value : 0.9647          
         Neg Pred Value : 0.9324          
             Prevalence : 0.5000          
         Detection Rate : 0.4650          
   Detection Prevalence : 0.4820          
      Balanced Accuracy : 0.9480          
                                          
       'Positive' Class : 1               
                                          
fold = 1 
fold = 2 
fold = 3 
fold = 4 
Confusion Matrix and Statistics

          Reference
Prediction   1   0
         1 486  10
         0  14 490
                                          
               Accuracy : 0.976           
                 95% CI : (0.9645, 0.9846)
    No Information Rate : 0.5             
    P-Value [Acc > NIR] : <2e-16          
                                          
                  Kappa : 0.952           
                                          
 Mcnemar's Test P-Value : 0.5403          
                                          
            Sensitivity : 0.9720          
            Specificity : 0.9800          
         Pos Pred Value : 0.9798          
         Neg Pred Value : 0.9722          
             Prevalence : 0.5000          
         Detection Rate : 0.4860          
   Detection Prevalence : 0.4960          
      Balanced Accuracy : 0.9760          
                                          
       'Positive' Class : 1               
                                          

all trees of ID3

library(caret)
library(partykit)
library(dplyr)

fold <- c(4, 3, 2)  # Values of xval (number of folds) to try

for (fold in fold) {
   cat("fold =", fold, "\n")
  for(i in 1:fold){
  trctrl <- trainControl(method = "cv", number = fold, savePredictions = TRUE)
  
  id_fit <- train(factor(will_go_to_college) ~ ., data = data, method = "ctree", trControl = trctrl, subset = trctrl$indexes[[i]])
  
    plot(id_fit$finalModel, main = paste("Decision Tree (Fold", fold,"-", i, ")"))
}
   
  # Get predicted values
  predictions <- predict(id_fit, newdata = data)
  
  # Create confusion matrix
  confusion_matrix <- confusionMatrix(predictions, data$will_go_to_college)
  
  # Print the confusion matrix
  print(confusion_matrix)
  
  # Print accuracy for each fold
  pred <- id_fit$pred
  pred$equal <- ifelse(pred$pred == pred$obs, 1, 0)
  eachfold <- pred %>%
    group_by(Resample) %>%
    summarise_at(vars(equal), list(Accuracy = mean))
  print(eachfold)

  # Plot decision tree


}
fold = 4 
Confusion Matrix and Statistics

          Reference
Prediction   1   0
         1 457  69
         0  43 431
                                          
               Accuracy : 0.888           
                 95% CI : (0.8668, 0.9069)
    No Information Rate : 0.5             
    P-Value [Acc > NIR] : < 2e-16         
                                          
                  Kappa : 0.776           
                                          
 Mcnemar's Test P-Value : 0.01816         
                                          
            Sensitivity : 0.9140          
            Specificity : 0.8620          
         Pos Pred Value : 0.8688          
         Neg Pred Value : 0.9093          
             Prevalence : 0.5000          
         Detection Rate : 0.4570          
   Detection Prevalence : 0.5260          
      Balanced Accuracy : 0.8880          
                                          
       'Positive' Class : 1               
                                          
fold = 3 
Confusion Matrix and Statistics

          Reference
Prediction   1   0
         1 456  60
         0  44 440
                                          
               Accuracy : 0.896           
                 95% CI : (0.8754, 0.9142)
    No Information Rate : 0.5             
    P-Value [Acc > NIR] : <2e-16          
                                          
                  Kappa : 0.792           
                                          
 Mcnemar's Test P-Value : 0.1413          
                                          
            Sensitivity : 0.9120          
            Specificity : 0.8800          
         Pos Pred Value : 0.8837          
         Neg Pred Value : 0.9091          
             Prevalence : 0.5000          
         Detection Rate : 0.4560          
   Detection Prevalence : 0.5160          
      Balanced Accuracy : 0.8960          
                                          
       'Positive' Class : 1               
                                          
fold = 2 
Confusion Matrix and Statistics

          Reference
Prediction   1   0
         1 456  60
         0  44 440
                                          
               Accuracy : 0.896           
                 95% CI : (0.8754, 0.9142)
    No Information Rate : 0.5             
    P-Value [Acc > NIR] : <2e-16          
                                          
                  Kappa : 0.792           
                                          
 Mcnemar's Test P-Value : 0.1413          
                                          
            Sensitivity : 0.9120          
            Specificity : 0.8800          
         Pos Pred Value : 0.8837          
         Neg Pred Value : 0.9091          
             Prevalence : 0.5000          
         Detection Rate : 0.4560          
   Detection Prevalence : 0.5160          
      Balanced Accuracy : 0.8960          
                                          
       'Positive' Class : 1               
                                          

all trees of Cart

library(caret)
library(rpart)
library(rpart.plot)

set.seed(123) # Random seed
fold <- c(2, 3, 4)  # Values of xval (number of folds) to try

for (fold in fold) {
  cat("fold"=fold)
  for(i in 1:fold){

trctrl <- trainControl(method = "cv", number = fold, savePredictions = TRUE)
dt_fit <- train(factor(will_go_to_college) ~ ., data = data, method = "rpart1SE", trControl = trctrl)

 # Plot decision tree
  cat("Decision Tree - Fold", i, ":\n")
  rpart.plot(dt_fit$finalModel)
  
}
# Get predicted values
predictions <- predict(dt_fit, newdata = data)

# Create confusion matrix
confusion_matrix <- confusionMatrix(predictions, data$will_go_to_college)

# Print the confusion matrix
print(confusion_matrix)

#print each flod
pred <- dt_fit$pred
pred$equal <- ifelse(pred$pred == pred$obs, 1,0)
eachfold <- pred %>%                                        
  group_by(Resample) %>%                         
  summarise_at(vars(equal),                     
               list(Accuracy = mean))              
print(eachfold  )

  }
2Decision Tree - Fold 1 :
Warning: Cannot retrieve the data used to build the model (so cannot determine roundint and is.binary for the variables).
To silence this warning:
    Call rpart.plot with roundint=FALSE,
    or rebuild the rpart model with model=TRUE.
Decision Tree - Fold 2 :

Confusion Matrix and Statistics

          Reference
Prediction   1   0
         1 430  67
         0  70 433
                                          
               Accuracy : 0.863           
                 95% CI : (0.8401, 0.8837)
    No Information Rate : 0.5             
    P-Value [Acc > NIR] : <2e-16          
                                          
                  Kappa : 0.726           
                                          
 Mcnemar's Test P-Value : 0.8643          
                                          
            Sensitivity : 0.8600          
            Specificity : 0.8660          
         Pos Pred Value : 0.8652          
         Neg Pred Value : 0.8608          
             Prevalence : 0.5000          
         Detection Rate : 0.4300          
   Detection Prevalence : 0.4970          
      Balanced Accuracy : 0.8630          
                                          
       'Positive' Class : 1               
                                          
3Decision Tree - Fold 1 :

Decision Tree - Fold 2 :

Decision Tree - Fold 3 :

Confusion Matrix and Statistics

          Reference
Prediction   1   0
         1 430  67
         0  70 433
                                          
               Accuracy : 0.863           
                 95% CI : (0.8401, 0.8837)
    No Information Rate : 0.5             
    P-Value [Acc > NIR] : <2e-16          
                                          
                  Kappa : 0.726           
                                          
 Mcnemar's Test P-Value : 0.8643          
                                          
            Sensitivity : 0.8600          
            Specificity : 0.8660          
         Pos Pred Value : 0.8652          
         Neg Pred Value : 0.8608          
             Prevalence : 0.5000          
         Detection Rate : 0.4300          
   Detection Prevalence : 0.4970          
      Balanced Accuracy : 0.8630          
                                          
       'Positive' Class : 1               
                                          
4Decision Tree - Fold 1 :

Decision Tree - Fold 2 :

Decision Tree - Fold 3 :

Decision Tree - Fold 4 :

Confusion Matrix and Statistics

          Reference
Prediction   1   0
         1 430  67
         0  70 433
                                          
               Accuracy : 0.863           
                 95% CI : (0.8401, 0.8837)
    No Information Rate : 0.5             
    P-Value [Acc > NIR] : <2e-16          
                                          
                  Kappa : 0.726           
                                          
 Mcnemar's Test P-Value : 0.8643          
                                          
            Sensitivity : 0.8600          
            Specificity : 0.8660          
         Pos Pred Value : 0.8652          
         Neg Pred Value : 0.8608          
             Prevalence : 0.5000          
         Detection Rate : 0.4300          
   Detection Prevalence : 0.4970          
      Balanced Accuracy : 0.8630          
                                          
       'Positive' Class : 1               
                                          

---
title: "R Notebook"
output: html_notebook
---

Important packages:

```{r}
setwd("Dataset")
Dataset <- read.csv("data.csv")
Preprocessed_dataset <-  read.csv("preprocessed_dataset.csv") 
if(!require(ggplot2)){
install.packages("ggplot2")}
library(ggplot2)
if(!require(dplyr)){
install.packages("dplyr")}
library(dplyr) 
if(!require(dplyr)){
install.packages("dplyr")}
library(dplyr)
if(!require(ltm)){
install.packages("ltm")}
library(ltm)
if (!require(cluster, quietly = TRUE)) {
  install.packages("cluster")
}
if (!require(factoextra, quietly = TRUE)) {
  install.packages("factoextra")
}
 
if (!require(caret, quietly = TRUE)) {
  install.packages("caret")
}

if (!require(partykit, quietly = TRUE)) {
  install.packages("partykit")
}

if (!require(rpart, quietly = TRUE)) {
  install.packages("rpart")
}
if (!require(rpart.plot, quietly = TRUE)) {
  install.packages("rpart.plot")
}
if (!require(tidyverse, quietly = TRUE)) {
  install.packages("tidyverse")
}
```

# IT326 Project {style="color: gray"}

\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_\_

## The Goal

Our primary objective of this analysis is to classify whatever a student will go to college or not using the classification methods and to identify the main factors and reasons why students are less likely to pursue higher education indicated by "will_go_to_college" being 'False'. By leveraging the provided dataset with attributes such as school type, school accreditation, gender, interest in college, residence, we aim to discover the most influential variables and their relationships with the decision not to attend college.

## The Source

Kaggle.com

## URL :

<https://www.kaggle.com/datasets/saddamazyazy/go-to-college-dataset>

## General information

-   Number of attributes : 11
-   Number of rows (objects) : 1000
-   The class label: The class label in our project is the attribute "will_go_to_college". This attribute is binary, meaning that it can take on two values: True for yes or False for no. The value of this attribute will be the target variable that we are trying to predict during our project.

------------------------------------------------------------------------

+-----------------------+------------------------------------------------------------+-----------+---------------------------------------------------------------------------+
| Attribute             | Description                                                | Type      | Possible values                                                           |
+-----------------------+------------------------------------------------------------+-----------+---------------------------------------------------------------------------+
| type_school           | The type of school the student attends                     | Binary    | Academic / Vocational                                                     |
+-----------------------+------------------------------------------------------------+-----------+---------------------------------------------------------------------------+
| school_accreditation  | The quality if school                                      | Binary    | A / B (A is better than B)                                                |
+-----------------------+------------------------------------------------------------+-----------+---------------------------------------------------------------------------+
| gender                | The student's gender                                       | Binary    | Male / Female                                                             |
+-----------------------+------------------------------------------------------------+-----------+---------------------------------------------------------------------------+
| interest              | The student's interest in going to college                 | Nominal   | Very interested /Interested / Less Interested / Not Interested /Uncertain |
+-----------------------+------------------------------------------------------------+-----------+---------------------------------------------------------------------------+
| residence             | The student's residence                                    | Binary    | Urban / Rural                                                             |
+-----------------------+------------------------------------------------------------+-----------+---------------------------------------------------------------------------+
| parent_age            | The age of the student's parents                           | Numeric   | 40 - 65                                                                   |
+-----------------------+------------------------------------------------------------+-----------+---------------------------------------------------------------------------+
| parent_salary         | The monthly salary of the student's parents in IDR/Rupiah. | Numeric   | 1000K - 10M                                                               |
|                       |                                                            |           |                                                                           |
|                       | [1Rupiah = 0.00024SAR]                                     |           |                                                                           |
+-----------------------+------------------------------------------------------------+-----------+---------------------------------------------------------------------------+
| house_area            | The size of the student's house in meter square            | Numeric   | 20 - 120                                                                  |
+-----------------------+------------------------------------------------------------+-----------+---------------------------------------------------------------------------+
| average_grades        | The student's average grades in school.                    | Numeric   | 75 - 98                                                                   |
+-----------------------+------------------------------------------------------------+-----------+---------------------------------------------------------------------------+
| parent_was_in_college | Whether the student's parents attended college.            | Binary    | True - False                                                              |
+-----------------------+------------------------------------------------------------+-----------+---------------------------------------------------------------------------+
| will_go_to_college    | Whether the student plans to go to college.                | Binary    | True - False                                                              |
+-----------------------+------------------------------------------------------------+-----------+---------------------------------------------------------------------------+

## Sample of our data

```{r}
head(Dataset)
```

Here are a sample of 6 row from our dataset.

## Missing values

```{r}
sum(is.na(Dataset))
```

There are no missing values in our dataset.

## Statistical graphs

Graph 1:

```{r}
df =data.frame(Dataset)
ggplot(data=df, aes(x = interest, fill = will_go_to_college)) +
  geom_bar() +
  scale_x_discrete(limits = c('Not Interested', 'Less Interested', 'Uncertain', 'Interested', 'Very Interested')) +
  labs(title = 'College interest vs College attendance ') +
  scale_fill_manual(values = c("True" = "antiquewhite2", "False" = "antiquewhite3")) +
  theme_minimal()


```

According to the graph, whether students are interested in going to college or not does not affect whether they actually end up attending Collage . There is a group of individuals who were interested in attending but did not receive acceptance, while others who were not interested were accepted.

Graph 2:

```{r}

filtered_True =filter(Dataset, will_go_to_college == 'True')
filtered_False =subset(Dataset, will_go_to_college =='False')

ggplot() +
  geom_density(data = filtered_True, aes(x = average_grades, fill = "Going to College"), alpha = 0.5) +
  geom_density(data = filtered_False, aes(x = average_grades, fill = "NOT Going to College"), alpha = 0.5) +
  labs(x = "Average Grades", y = "Density") +
  ggtitle("Comparison of Average Grades for Students Going to College and NOT Going to College") +
  scale_fill_manual(values = c("Going to College" = "antiquewhite4", "NOT Going to College" = "antiquewhite1"))

```

The graph shows that the average grades for students who had accepted to go to college were higher than those who did not enter college , and this indicates the existence of a correlation between those who going to college and the average grades

Graph 3:

```{r}
Dataset_percentage <- Dataset %>%
  group_by(type_school) %>%
  summarise(percentage = mean(will_go_to_college == "True") * 100)

# Create a percentage chart
ggplot(Dataset_percentage, aes(x = type_school, y = percentage, fill = type_school)) +
  geom_bar(stat = "identity") +
  labs(title = "Percentage of Students Going to College by Type of School",
       x = "Type of School",
       y = "Percentage") +
  scale_fill_manual(values = c("Academic" = "antiquewhite3", "Vocational" = "antiquewhite2")) +
  theme_minimal()

 
```

This graph shows the impact of the type of high school attended by students on their college attendance . Based on the bar chart:

-   among students from Academic high schools, 313 are going to college, and 296 are not.

-   among students from Vocational high schools, 187 are going to college, and 204 are not

These information tell us that a higher proportion of students from academic high schools are going to college compared to those from vocational schools which suggest that the type of school attended.

Graph 4:

```{r}
ggplot(Dataset, aes(x = average_grades)) +
  geom_histogram(binwidth = 5, fill = "antiquewhite2", color = "antiquewhite4") +
  labs(title = "Distribution of students' grades",
       x = "Students' average grades",
       y = "Frequency") +
  theme_minimal()
```

This histogram show us that the majority of the students in the dataset are performing well since it seems like their grades are spanning between 75 and 98. This analysis will help us determine whether the academic performance level of students is a contributing factor to their college attendance or not.

## Statistical Measures

1.  The student's academic performance analysis:

```{r}
summary(Dataset$average_grades)

```

The student grades in our dataset range from 75.00 to 98.00, with a median of 85.58 and an average of 86.10. This suggests that most students are doing well as none of them have average grades below 50. However, it's interesting to note that some students have much higher or lower grades than the average, mainly due to the wide range of grades..

2.  The socioeconomic status of students' families analysis:

```{r}
summary(Dataset$parent_salary)
```

In the dataset, we've got students parents with salaries ranging from 1,000,000 to 10,000,000 IDR/Rupiah. The median salary is 5,440,000 IDR/Rupiah, and the average is 5,381,570 IDR/Rupiah. This data tells us that many parents in our dataset earn less than the average salary in Indonesia, which is 146,000,000 IDR. This suggests that quite a few students in our dataset come from families with limited finances. And this financial situation could certainly impact their ability to get through college.

```{r}
summary(Dataset$house_area)
```

Additionally we can utilize the house area attribute to gain a deeper understanding of the socioeconomic status of students' families, where students with houses significantly larger than the mean might indicate a higher socioeconomic status, while those with houses considerably smaller than the mean might reflect a comparatively lower socioeconomic status. Based on the shown output, the house areas range from [20.00-120.00 ㎡]. The median house area is 75.50 ㎡ indicates that families with house areas around this value likely have moderate socioeconomic status with houses that neither very small nor very large.

3.  Understanding Parent Age Range and Variation in its Values

```{r}
summary(Dataset$parent_age)
SD=sd(Dataset$parent_age)
MeanAge=mean(Dataset$parent_age)
cat("coefficient of variation:",SD/MeanAge*100,"%")

```

This summary provides the range for age attribute [40,65] which indicates that all parent in middle age during this age parent have more concern about their children , the coefficient of variation= 6.7% which indicates lower variation ,and the value of attribute parent_agerare are relatively close to the mean overall 25% of them have an age below or equal to 50 , 75% have an age below or equal to 54 and the median value is 52

## Outliers analysis

```{r}
###parent age outliers
quartiles <- quantile(Dataset$parent_age, probs=c(.25, .75), na.rm = FALSE)
IQR <- IQR(Dataset$parent_age)

Lower <- quartiles[1] - 1.5*IQR
Upper <- quartiles[2] + 1.5*IQR 

data_no_outlier <- subset(Dataset, Dataset$parent_age > Lower & Dataset$parent_age < Upper)
dim(data_no_outlier)

###parent salary outliers
quartiles <- quantile(Dataset$parent_salary, probs=c(.25, .75), na.rm = FALSE)
IQR <- IQR(Dataset$parent_salary)

Lower <- quartiles[1] - 1.5*IQR
Upper <- quartiles[2] + 1.5*IQR 

data_no_outlier <- subset(data_no_outlier, data_no_outlier$parent_salary> Lower & data_no_outlier$parent_salary < Upper)
dim(data_no_outlier)


###averge grades outliers
quartiles <- quantile(Dataset$average_grades, probs=c(.25, .75), na.rm = FALSE)
IQR <- IQR(Dataset$average_grades)

Lower <- quartiles[1] - 1.5*IQR
Upper <- quartiles[2] + 1.5*IQR 

data_no_outlier <- subset(data_no_outlier, data_no_outlier$average_grades> Lower & data_no_outlier$average_grades < Upper)
dim(data_no_outlier)


###house area outliers
quartiles <- quantile(Dataset$house_area, probs=c(.25, .75), na.rm = FALSE)
IQR <- IQR(Dataset$house_area)

Lower <- quartiles[1] - 1.5*IQR
Upper <- quartiles[2] + 1.5*IQR 

data_no_outlier <- subset(data_no_outlier, data_no_outlier$house_area> Lower & data_no_outlier$house_area < Upper)

Founded_Outliers=data.frame(anti_join(Dataset,data_no_outlier))
print(Founded_Outliers)
```

After conducting data analysis and identifying outliers, our inspection reveals that the detected outliers represent inherent variation within the population. Regarding Parent_age, outliers are observed for values below 44 and above 65. However, it should be noted the age from 40 to 65 fall within the expected mean of our dataset meaning that it doesn't indicate that they are outliers . For parent_salary, we found two outliers: one below 1,326,250 ind ≈ 85 USD and another above 9,416,250 ind ≈ 606 USD. The minimum and maximum values were determined to be 1,000,000 ind ≈ 64 USD and 10,000,000 ind ≈ 644 USD, respectively. In the case of grades, twelve outliers were identified, ranging from below 76 to above 97. Nevertheless, since the data falls within the acceptable range of 0 to 100, these outliers should be retained as they are still considered normal and within the usual grade range. Finally, for house_area, we found eleven outliers below 34.4m and above 115m, with the minimum being 20m and the maximum being 120m. However, these values are still considered typical for the population.

## Normalization

```{r}
normalize <- function(x) {return((x-min(x))/ (max(x)-min(x)))}
datasetWithoutNormalization<-Dataset
Dataset$parent_salary<-normalize(datasetWithoutNormalization$parent_salary)
Dataset$house_area<-normalize(datasetWithoutNormalization$house_area)
print(Dataset)
```

We applied normalization to the 'parent_salary' and 'house_area' attributes, scaling their values to a range between 0 and 1. This normalization process greatly facilitates data handling and analysis, ensuring that these attributes are on a consistent scale. Which will improve the reliability of our data analysis and enable better conclusions to be drawn from the dataset. Normalization is a crucial step in preparing the data for modeling, as it prevents attributes with larger numerical ranges from dominating the analysis and ensures fair treatment for all features.

## Discretization

```{r}


Dataset$average_grades [Dataset$average_grades >= 95] <- '+A'
Dataset$average_grades [95 >Dataset$average_grades & Dataset$average_grades >= 90] <- 'A'
Dataset$average_grades [90 >Dataset$average_grades & Dataset$average_grades >= 85] <- '+B'
Dataset$average_grades [85 >Dataset$average_grades & Dataset$average_grades >= 80] <- 'B'
Dataset$average_grades [80 >Dataset$average_grades & Dataset$average_grades >= 75] <- '+C'
Dataset$average_grades [75 >Dataset$average_grades & Dataset$average_grades >= 70] <- 'C'
Dataset$average_grades [70 >Dataset$average_grades & Dataset$average_grades >= 65] <- '+D'
Dataset$average_grades [65 >Dataset$average_grades & Dataset$average_grades >= 60] <- 'D'
Dataset$average_grades [60 >Dataset$average_grades & Dataset$average_grades >= 0] <- 'F'
Dataset$average_grades <- as.character(Dataset$average_grades )
print(Dataset)
```

We transformed the parent_age attribute into intervals by dividing the values to be fall on one of two possible interval labels with equal width which is(40,50],(50,60] by discretization the values well be simpler to classify or perform other methods that can help us later in our model.

and to better utilize and interpret the grades attributes for each student, we have converted the numeric grades into letter grades (A+, A, B+, B, C+, C, D+, D, F). This transformation was undertaken to focus on the general letter grade representation rather than the precise numerical values.

## Encoding

```{r}
Dataset$parent_was_in_college[Dataset$parent_was_in_college=="TRUE"]<-1
Dataset$parent_was_in_college[Dataset$parent_was_in_college=="True"]<-1
Dataset$parent_was_in_college[Dataset$parent_was_in_college=="FALSE"]<-0
Dataset$parent_was_in_college[Dataset$parent_was_in_college=="False"]<-0

Dataset$will_go_to_college[Dataset$will_go_to_college=="TRUE"]<-0
Dataset$will_go_to_college[Dataset$will_go_to_college=="True"]<-0
Dataset$will_go_to_college[Dataset$will_go_to_college=="FALSE"]<-1
Dataset$will_go_to_college[Dataset$will_go_to_college=="False"]<-1

Dataset$gender[Dataset$gender=="Female"]<-1
Dataset$gender[Dataset$gender=="Male"]<-0

Dataset$school_accreditation[Dataset$school_accreditation=="A"]<-1
Dataset$school_accreditation[Dataset$school_accreditation=="B"]<-0

Dataset$interest[Dataset$interest=="Very Interested"]<-4
Dataset$interest[Dataset$interest=="Interested"]<-3
Dataset$interest[Dataset$interest=="Less Interested"]<-2
Dataset$interest[Dataset$interest=="Not Interested"]<-1
Dataset$interest[Dataset$interest=="Uncertain"]<-0


Dataset$type_school[Dataset$type_school=="Academic"]<-1 
Dataset$type_school[Dataset$type_school=="Vocational"]<-0

Dataset$residence[Dataset$residence=="Urban"]<-1
Dataset$residence[Dataset$residence=="Rural"]<-0
print(Dataset)
```

Since encoding is an important step in data preprocessing that enables the use of categorical data in various data analysis and machine learning tasks, we encoded attributes like the 'parent was in college' attribute from (True, False) to (1, 0), and 'will go to college' from (True, False) to (0, 1). This encoding is carried out as we aim to predict the influencing factors. Additionally, we encoded the 'gender' attribute from (Female, Male) to (1, 0), 'school accreditation' from (A, B) to (1, 0), 'type_school' from (Academic, Vocational) to (1, 0), 'residence' from (Urban, Rural) to (1, 0), and 'interest' from (Very interested ,Interested , Less Interested , Not Interested ,Uncertain ) to (4,3,2, 1, 0) respectively. Encoding serves to simplify the data, reduce complexity, and enhance its suitability for modeling purposes.

## Correlation analysis Chi square test for nominal attribute:

```{r}

#1
C=chisq.test(Dataset$type_school , Dataset$will_go_to_college)
print(C)
#2

C=chisq.test(Dataset$school_accreditation , Dataset$will_go_to_college)
print(C)
#3
C=chisq.test(Dataset$gender , Dataset$will_go_to_college)
print(C)
#4
C=chisq.test(Dataset$interest , Dataset$will_go_to_college)
print(C)

#5
C=chisq.test(Dataset$residence , Dataset$will_go_to_college)
print(C)

#6
C=chisq.test(Dataset$average_grades , Dataset$will_go_to_college)
print(C)

#7
C=chisq.test(Dataset$parent_was_in_college , Dataset$will_go_to_college)
print(C)
```

All the attributes have X-square greater than the p-value which indicate a some association with the class label; therefore we reject the null hypothesis

we noticed for 'interest' and 'average grade' the analysis shows that X-square is much larger than p-value indicate the significant association of the two attributes with the decision of the student to go to the collage or not

## Correlation coefficient analysis for numeric attribute:

```{r}

biserial.cor(Dataset$parent_salary,Dataset$will_go_to_college, c("all.obs", "complete.obs"), level = 1)
biserial.cor(Dataset$house_area,Dataset$will_go_to_college, c("all.obs", "complete.obs"), level = 1)
biserial.cor(Dataset$parent_age,Dataset$will_go_to_college,c("all.obs", "complete.obs"), level = 1)
 
 

```

the analysis shows moderate correlation coefficient for parent salary and house area with the class label which indicate that they are relevant factors meaning that the higher the parent salary and the larger house area the higher probability for a student to enroll in a collage

where is the on other hand, the correlation coefficient for the parent age is very small which indicate that the parent age has little impact to the probability for student to enroll in a collage

## Feature selection:

ultimately based on the analysis of the correlation that we conducted on the relationship of the dataset attributes with the class label, and the understanding of the data and the context of each attribute and potential relevance to the class label we decided to not delete any of the attribute

## Clustering **Analysis:**

In this analysis, we apply K-means clustering to the dataset using different values of K. K-means clustering is an unsupervised learning algorithm that partitions the data into K clusters based on similarity. We will explore three different values of K and evaluate the clustering results using various metrics.

### Removing the class label and preparing the dataset for Clustering

```{r}
 
original_data <- Preprocessed_dataset

# Remove any non-numeric attributes
numeric_data <- original_data[, sapply(original_data, is.numeric)]

# Remove the class label 'will_go_to_college'
numeric_data <- numeric_data[, !(names(numeric_data) == 'will_go_to_college')]


# Print the dataset "numeric_data" to make sure it's prepared for clustering
print(numeric_data)

# Scaling the dataset
# numeric_data <- scale(numeric_data)

 
```

Now, the 'numeric_dataset' dataset contains only numeric attributes without the class label, which makes it ready for the clustering process.

## K=2

```{r}
# k-means clustering set a seed for random number generation to make the results reproducible 
set.seed(8953)

# run kmeans clustering to find 2 clusters
kmeans.result <- kmeans(numeric_data, 2)

# visualize clustering
library(factoextra)
fviz_cluster(kmeans.result, data = numeric_data)

# print the clustering result
print(kmeans.result)


```

#### The Silhouette coefficient

```{r}
#average for each cluster 
avg_sil <- silhouette(kmeans.result$cluster, dist(numeric_data)) 

#k-means clustering with estimating k and initializations 
fviz_silhouette(avg_sil)

```

#### The total within-cluster sum of squares

```{r}
# Calculate total within-cluster sum of squares
total_withinss <- kmeans.result$tot.withinss
cat("Total Within-Cluster Sum of Squares:", sum(total_withinss), "\n")

true_labels <- c(1, 1, 2, 1, 2, 2, 3, 3, 4, 4)  # Adjust based on your actual true labels

cluster_assignments <- kmeans.result$cluster
 
```

#### BCubed recall and precision

```{r}

# Calculate BCubed precision
precision <- 0
for (i in unique(true_labels)) {
  cluster_indices <- which(true_labels == i)
  precision <- precision + sum((table(cluster_assignments[cluster_indices]) * (table(cluster_assignments[cluster_indices]) - 1)) / sum(table(cluster_assignments[cluster_indices])))
}
precision <- precision / sum(table(cluster_assignments))

# Calculate BCubed recall
recall <- 0
for (j in unique(cluster_assignments)) {
  cluster_indices <- which(cluster_assignments == j)
  recall <- recall + sum((table(true_labels[cluster_indices]) * (table(true_labels[cluster_indices]) - 1)) / sum(table(true_labels[cluster_indices])))
}
recall <- recall / sum(table(true_labels))

cat("BCubed Precision:", precision, "\n")
cat("BCubed Recall:", recall, "\n")


```

## K=2

```{r}
# k-means clustering set a seed for random number generation to make the results reproducible 
set.seed(8953)

# run kmeans clustering to find 2 clusters
kmeans.result <- kmeans(numeric_data, 2)

# visualize clustering
library(factoextra)
fviz_cluster(kmeans.result, data = numeric_data)

# print the clustering result
print(kmeans.result)


```

#### The Silhouette coefficient

```{r}
#average for each cluster 
avg_sil <- silhouette(kmeans.result$cluster, dist(numeric_data)) 

#k-means clustering with estimating k and initializations 
fviz_silhouette(avg_sil)

```

#### The total within-cluster sum of squares

```{r}
# Calculate total within-cluster sum of squares
total_withinss <- kmeans.result$tot.withinss
cat("Total Within-Cluster Sum of Squares:", sum(total_withinss), "\n")

true_labels <- c(1, 1, 2, 1, 2, 2, 3, 3, 4, 4)  # Adjust based on your actual true labels

cluster_assignments <- kmeans.result$cluster
 
```

#### BCubed recall and precision

```{r}

# Calculate BCubed precision
precision <- 0
for (i in unique(true_labels)) {
  cluster_indices <- which(true_labels == i)
  precision <- precision + sum((table(cluster_assignments[cluster_indices]) * (table(cluster_assignments[cluster_indices]) - 1)) / sum(table(cluster_assignments[cluster_indices])))
}
precision <- precision / sum(table(cluster_assignments))

# Calculate BCubed recall
recall <- 0
for (j in unique(cluster_assignments)) {
  cluster_indices <- which(cluster_assignments == j)
  recall <- recall + sum((table(true_labels[cluster_indices]) * (table(true_labels[cluster_indices]) - 1)) / sum(table(true_labels[cluster_indices])))
}
recall <- recall / sum(table(true_labels))

cat("BCubed Precision:", precision, "\n")
cat("BCubed Recall:", recall, "\n")


```

## K=6

```{r}
# k-means clustering set a seed for random number generation to make the results reproducible 
set.seed(8953)

# run kmeans clustering to find 6 clusters
kmeans.result <- kmeans(numeric_data, 6)

# visualize clustering
library(factoextra)
fviz_cluster(kmeans.result, data = numeric_data)

# print the clustering result
print(kmeans.result)

library(cluster) 
#average for each cluster 
avg_sil <- silhouette(kmeans.result$cluster,dist(numeric_data)) 

#k-means clustering with estimating k and initializations 
fviz_silhouette(avg_sil)
```

### The optimal number of cluster

To find the optimal number of clusters to use in the k-means algorithm, we'll use the **fviz_nbclust()** function from the **factoextra**package to create a plot of the number of clusters vs. the total within sum of squares

```{r}
# Function to calculate total within-cluster sum of squares (wss)
wss <- function(k) {
  kmeans_result <- kmeans(numeric_data, centers = k, nstart = 10)  # You can adjust nstart based on your preference
  return(sum(kmeans_result$tot.withinss))
}

# Calculate the total within-cluster sum of squares for different values of k
k_values <- 1:10  # You can adjust the range of k values
wss_values <- sapply(k_values, wss)

# Plot the elbow curve
plot(k_values, wss_values, type = "b", pch = 19, frame = FALSE, 
     xlab = "Number of Clusters (k)", ylab = "Total Within-Cluster Sum of Squares (WSS)",
     main = "Elbow Method")

# Adding a line to indicate the "elbow"
abline(v = which(diff(wss_values) == max(diff(wss_values))) + 1, col = "red")


```

```{r}
gap_stat <- clusGap(numeric_data,
                    FUN = kmeans,
                    nstart = 25,
                    K.max = 10,
                    B = 50)

#plot number of clusters vs. gap statistic
fviz_gap_stat(gap_stat)
```

## Classification:

### factor the data

```{r}

data <- Preprocessed_dataset

data$will_go_to_college <- factor(data$will_go_to_college, levels = c("1", "0"), labels = c("1", "0"))
data$residence <- factor(data$residence, levels = c("1", "0"), labels = c("1", "0"))
data$gender <- factor(data$gender, levels = c("1", "0"), labels = c("1", "0"))
data$parent_was_in_college <- factor(data$parent_was_in_college, levels = c("1", "0"), labels = c("1", "0"))
data$interest <- factor(data$interest, levels = c("4","3","2","1", "0"), labels = c("4","3","2","1", "0"))
data$type_school <- factor(data$type_school, levels = c("1", "0"), labels = c("1", "0"))
data$school_accreditation <- factor(data$school_accreditation, levels = c("1", "0"), labels = c("1", "0"))

data$average_grades <- factor(data$average_grades, levels = c("+A", "A","+B","B","+C","C","+D","D","F"), labels = c("+A", "A","+B","B","+C","C","+D","D","F"))
str(data)

```

### balanced or imbalanced

```{r}

library(tidyverse) 
library(caret)
data$will_go_to_college<- as.numeric(data$will_go_to_college)
hist(data$will_go_to_college,col="coral")
prop.table(table(data$will_go_to_college))
```

we want to confirm that the distribution between the two label data is not too much different. Because imbalanced datasets can lead to imbalanced accuracy.

Fortunately ,our data is balanced

### partition method

We opted for cross-validation as our partition method owing to the constraints posed by limited data availability. To ensure robustness in our evaluation, we employed three distinct values for k folds 2, 3, and 4. we chose small k folds because of our small data

### c4.5

```{r}
library(caret)
library(rpart)
library(dplyr)
library(rpart.plot)


set.seed(123)
fold <- c(2, 3, 4)  # Values of xval (number of folds) to try

for (fold in fold) {
  cat("fold =", fold, "\n")
  trctrl <- trainControl(method = "cv", number = fold, savePredictions = TRUE)
  c45_fit <- train(will_go_to_college ~ ., data = data, method = "J48", trControl = trctrl)
   print(c45_fit$finalModel)
  # Get predicted values
  predictions <- predict(c45_fit, newdata = data)
  
  # Create confusion matrix
  confusion_matrix <- confusionMatrix(predictions, data$will_go_to_college)
  
  # Print the confusion matrix
  print(confusion_matrix)
  
  # Print accuracy for each fold
  pred <- c45_fit$pred
  pred$equal <- ifelse(pred$pred == pred$obs, 1, 0)
  eachfold <- pred %>%
    group_by(Resample) %>%
    summarise_at(vars(equal),
                 list(Accuracy = mean))
  print(eachfold)
    # Plot decision tree
plot(c45_fit$finalModel, main = fold)
  
}
```

The gain ratio consistently favors unbalanced splits, as demonstrated by its selection of "Parent salary" as the root for all three trees even though it's shown in the tree "average grades" as the root but the split point's that all the value in one diraction . In this configuration, one partition is notably smaller than the others, and the feature exhibits a higher number of distinct values. Despite the fact that the node corresponding to "Parent age", "parent was in collage" is not pure, the resulting trees exhibit impressive accuracy levels, all surpassing 94%

### cart

```{r}
library(caret)
library(rpart)
library(rpart.plot)



set.seed(1234) # Random seed
fold <- c(2, 3, 4)  # Values of xval (number of folds) to try

for (fold in fold) {
cat("fold"=fold)
trctrl <- trainControl(method = "cv", number = fold, savePredictions = TRUE)
dt_fit <- train(factor(will_go_to_college) ~ ., data = data, method = "rpart1SE", trControl = trctrl)

 # Plot decision tree
  rpart.plot(dt_fit$finalModel)
  
# Get predicted values
predictions <- predict(dt_fit, newdata = data)

# Create confusion matrix
confusion_matrix <- confusionMatrix(predictions, data$will_go_to_college)

# Print the confusion matrix
print(confusion_matrix)

#print each flod
pred <- dt_fit$pred
pred$equal <- ifelse(pred$pred == pred$obs, 1,0)
eachfold <- pred %>%                                        
  group_by(Resample) %>%                         
  summarise_at(vars(equal),                     
               list(Accuracy = mean))              
print(eachfold  )

  }
```

averages grade exhibits the smallest Gini index binary split, signifying a substantial reduction in impurity. Hence, it is chosen as the splitting attribute. Conversely, attributes such as 'type_school,' 'school_accreditation,' 'gender,' 'parent_age,' and 'parent_was_in_college' yield minimal impurity reduction, leading to their exclusion from the tree. The dataset's balanced class labels and marginal differences in accuracy across folds result in consistent tree structures, as evidenced by the identical trees in all folds. For further details, refer to the [index]. Overall, the model attains an 86% accuracy, emphasizing its effectiveness.

### ID3

```{r}
library(caret)
library(partykit)
library(dplyr)

fold <- c(4, 3, 2)  # Values of xval (number of folds) to try

for (fold in fold) {
  cat("fold =", fold, "\n")
  trctrl <- trainControl(method = "cv", number = fold, savePredictions = TRUE)
  
  id_fit <- train(factor(will_go_to_college) ~ ., data = data, method = "ctree", trControl = trctrl)
   print(id_fit$finalModel)
  # Get predicted values
  predictions <- predict(id_fit, newdata = data)
  
  # Create confusion matrix
  confusion_matrix <- confusionMatrix(predictions, data$will_go_to_college)
  
  # Print the confusion matrix
  print(confusion_matrix)
  
  # Print accuracy for each fold
  pred <- id_fit$pred
  pred$equal <- ifelse(pred$pred == pred$obs, 1, 0)
  eachfold <- pred %>%
    group_by(Resample) %>%
    summarise_at(vars(equal), list(Accuracy = mean))
  print(eachfold)

  # Plot decision tree
  plot(id_fit$finalModel, main = paste("Decision Tree (Fold", fold, ")"))
  
}
```

Additional insights reveal that attributes such as school accreditation, parent was in collage contribute to high impurity. In contrast, Parent salary is chosen as the root due to its high purity. Given the balanced class labels in our dataset and minimal variations in accuracy across folds, the result yields consistent tree structures, with only two distinct trees observed for all folds. For further details, please refer to the [index]. The overall accuracy consistently surpasses 86%, affirming the model's efficacy

### final analysis

The C4.5 model emerged as the top-performing evaluation model, achieving an impressive accuracy rate of 94% to 97%. It was followed by the ID3 model, which demonstrated slightly lower accuracy ranging from 86% to 89%. Lastly, the cart model exhibited an accuracy rate of 86%.

the C4.5 gave better result than ID3 and Cart because they both are biased to multivalued where C4.5 normalized parent salary and house area which are multivalue attributes

C4.5 and ID3 models, the parent's salary served as the root feature, indicating that the financial circumstances of the student are a crucial factor for contemporary universities.

## index

### all trees of C4.5

```{r}
library(caret)
library(rpart)
library(dplyr)
library(rpart.plot)

folds <- c(2, 3, 4)  # Values of xval (number of folds) to try

for(folds in folds){
   cat("fold =", fold_val, "\n")
for (fold_val in 1:folds) {

  
  trctrl <- trainControl(method = "cv", number = fold_val, savePredictions = TRUE)
  c45_fit <- train(factor(will_go_to_college) ~ ., data = data, method = "J48", trControl = trctrl)
  
   plot(c45_fit$finalModel, main = paste("Decision Tree - Fold", fold_val))
}
  # Get predicted values
  predictions <- predict(c45_fit, newdata = data)
  
  # Create confusion matrix
  confusion_matrix <- confusionMatrix(predictions, data$will_go_to_college)
  
  # Print the confusion matrix
  print(confusion_matrix)
  
  # Print accuracy for each fold
  pred <- c45_fit$pred
  pred$equal <- ifelse(pred$pred == pred$obs, 1, 0)
  eachfold <- pred %>%
    group_by(Resample) %>%
    summarise_at(vars(equal),
                 list(Accuracy = mean))
  print(eachfold)
  
  # Plot decision tree for each fold
 

}
```

### all trees of ID3

```{r}
library(caret)
library(partykit)
library(dplyr)

fold <- c(4, 3, 2)  # Values of xval (number of folds) to try

for (fold in fold) {
   cat("fold =", fold, "\n")
  for(i in 1:fold){
  trctrl <- trainControl(method = "cv", number = fold, savePredictions = TRUE)
  
  id_fit <- train(factor(will_go_to_college) ~ ., data = data, method = "ctree", trControl = trctrl, subset = trctrl$indexes[[i]])
  
    plot(id_fit$finalModel, main = paste("Decision Tree (Fold", fold,"-", i, ")"))
}
   
  # Get predicted values
  predictions <- predict(id_fit, newdata = data)
  
  # Create confusion matrix
  confusion_matrix <- confusionMatrix(predictions, data$will_go_to_college)
  
  # Print the confusion matrix
  print(confusion_matrix)
  
  # Print accuracy for each fold
  pred <- id_fit$pred
  pred$equal <- ifelse(pred$pred == pred$obs, 1, 0)
  eachfold <- pred %>%
    group_by(Resample) %>%
    summarise_at(vars(equal), list(Accuracy = mean))
  print(eachfold)

  # Plot decision tree


}
```

### all trees of Cart

```{r}
library(caret)
library(rpart)
library(rpart.plot)

set.seed(123) # Random seed
fold <- c(2, 3, 4)  # Values of xval (number of folds) to try

for (fold in fold) {
  cat("fold"=fold)
  for(i in 1:fold){

trctrl <- trainControl(method = "cv", number = fold, savePredictions = TRUE)
dt_fit <- train(factor(will_go_to_college) ~ ., data = data, method = "rpart1SE", trControl = trctrl)

 # Plot decision tree
  cat("Decision Tree - Fold", i, ":\n")
  rpart.plot(dt_fit$finalModel)
  
}
# Get predicted values
predictions <- predict(dt_fit, newdata = data)

# Create confusion matrix
confusion_matrix <- confusionMatrix(predictions, data$will_go_to_college)

# Print the confusion matrix
print(confusion_matrix)

#print each flod
pred <- dt_fit$pred
pred$equal <- ifelse(pred$pred == pred$obs, 1,0)
eachfold <- pred %>%                                        
  group_by(Resample) %>%                         
  summarise_at(vars(equal),                     
               list(Accuracy = mean))              
print(eachfold  )

  }

```
